Parsing the content of various document types with Apache Tika
1. Dependencies
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build for the Tika demo: a Spring Boot 2.7 web application that pulls
  Apache Tika 2.8.0 in through the tika-bom so that tika-core and
  tika-parsers-standard-package share one managed version.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- NOTE(review): the project's own groupId was lost from the listing; com.example is a placeholder. -->
    <groupId>com.example</groupId>
    <artifactId>TikaResouce</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.7.0</version>
    </parent>

    <dependencyManagement>
        <dependencies>
            <!-- Importing the BOM lets the Tika dependencies below omit their versions. -->
            <dependency>
                <groupId>org.apache.tika</groupId>
                <artifactId>tika-bom</artifactId>
                <version>2.8.0</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>commons-fileupload</groupId>
            <artifactId>commons-fileupload</artifactId>
            <version>1.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-core</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-parsers-standard-package</artifactId>
        </dependency>
    </dependencies>
</project>
2. Configuration file
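Create a new file named tika-config.xml on the application classpath (for example under src/main/resources) with the following content: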
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Tika configuration that declares the encoding detectors explicitly.
  Each detector is given its own markLimit parameter (64000, 64001 and 64002).
-->
<properties>
    <encodingDetectors>
        <encodingDetector class="org.apache.tika.parser.html.HtmlEncodingDetector">
            <params>
                <param name="markLimit" type="int">64000</param>
            </params>
        </encodingDetector>
        <encodingDetector class="org.apache.tika.parser.txt.UniversalEncodingDetector">
            <params>
                <param name="markLimit" type="int">64001</param>
            </params>
        </encodingDetector>
        <encodingDetector class="org.apache.tika.parser.txt.Icu4jEncodingDetector">
            <params>
                <param name="markLimit" type="int">64002</param>
            </params>
        </encodingDetector>
    </encodingDetectors>
</properties>
3. Configuration class
// NOTE(review): the original package name was lost from the listing; adjust this placeholder to your project.
package com.example.tika.config;

import java.io.IOException;
import java.io.InputStream;

import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.core.io.Resource;
import org.springframework.core.io.ResourceLoader;
import org.xml.sax.SAXException;

/**
 * Tika configuration class.
 *
 * <p>Exposes a {@link Tika} facade as a Spring bean, built from an XML Tika
 * configuration file loaded from the classpath.
 */
@Configuration
public class MyTikaConfig {

    @Autowired
    private ResourceLoader resourceLoader;

    /**
     * Builds the {@link Tika} bean from the classpath configuration file.
     *
     * @return a {@code Tika} facade using the configured detector and an
     *         {@link AutoDetectParser} created from the same configuration
     * @throws TikaException if the configuration cannot be interpreted by Tika
     * @throws IOException if the configuration resource cannot be opened or read
     * @throws SAXException if the configuration XML cannot be parsed
     */
    @Bean
    public Tika tika() throws TikaException, IOException, SAXException {
        // NOTE(review): the resource name was truncated to "classpath:" in the listing;
        // tika-config.xml is the assumed name of the configuration file.
        Resource resource = resourceLoader.getResource("classpath:tika-config.xml");

        // try-with-resources: the stream is only needed while TikaConfig is constructed,
        // so it is closed here instead of being left open.
        try (InputStream inputStream = resource.getInputStream()) {
            TikaConfig config = new TikaConfig(inputStream);
            Detector detector = config.getDetector();
            Parser autoDetectParser = new AutoDetectParser(config);
            return new Tika(detector, autoDetectParser);
        }
    }
}
4. Controller
// NOTE(review): the original package name was lost from the listing; adjust this placeholder to your project.
package com.example.tika.controller;

import java.io.IOException;
import java.io.InputStream;

import javax.annotation.Resource;

import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;

/**
 * REST controller that accepts an uploaded file and extracts its text with Tika.
 */
@RestController
@RequestMapping("/tika")
public class TikaController {

    @Resource
    private Tika tika;

    /**
     * Handles {@code POST /tika/pdf}: parses the uploaded file to plain text and prints it.
     *
     * @param file the uploaded document, sent as the multipart request parameter {@code file}
     * @throws IOException if the upload cannot be read
     * @throws TikaException if Tika fails to parse the document
     */
    @PostMapping("/pdf")
    public void TikaDemon(@RequestParam("file") MultipartFile file) throws IOException, TikaException {
        // try-with-resources closes the upload stream whether or not parsing succeeds.
        try (InputStream inputStream = file.getInputStream()) {
            String s = tika.parseToString(inputStream);
            // NOTE(review): the output call was truncated to "(s);" in the listing;
            // printing to standard output is the assumed original behaviour.
            System.out.println(s);
        }
    }
}
This concludes the article on parsing the content of various document types with Apache Tika. For more on Apache Tika document content parsing, please search my previous articles or browse the related articles below. I hope you will continue to support me in the future!