SoFunction
Updated on 2024-11-10

Sample code: parsing the content of various document types with Apache Tika

Parsing the content of various document types with Apache Tika

1. Dependencies

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the Tika demo: Spring Boot web starter plus Apache Tika
     (core + standard parsers), with Tika versions pinned through the Tika BOM. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <!-- NOTE(review): the project's own groupId was lost from the listing;
         com.example is a placeholder. Replace it with your real group id. -->
    <groupId>com.example</groupId>
    <artifactId>TikaResouce</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.7.0</version>
    </parent>
    <dependencyManagement>
        <dependencies>
            <!-- Importing the BOM lets the tika-* dependencies below omit their versions. -->
            <dependency>
                <groupId>org.apache.tika</groupId>
                <artifactId>tika-bom</artifactId>
                <version>2.8.0</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>commons-fileupload</groupId>
            <artifactId>commons-fileupload</artifactId>
            <version>1.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-core</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-parsers-standard-package</artifactId>
        </dependency>
    </dependencies>
</project>

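2. Configuration file

Create a new Tika configuration file (for example, tika-config.xml) in the project's resources directory so that it is available on the classpath: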

<?xml version="1.0" encoding="UTF-8"?>
<!-- Tika configuration: registers three encoding detectors, each with its own
     markLimit parameter. -->
<properties>
    <encodingDetectors>
        <encodingDetector class="org.apache.tika.parser.html.HtmlEncodingDetector">
            <params>
                <param name="markLimit" type="int">64000</param>
            </params>
        </encodingDetector>
        <encodingDetector class="org.apache.tika.parser.txt.UniversalEncodingDetector">
            <params>
                <param name="markLimit" type="int">64001</param>
            </params>
        </encodingDetector>
        <encodingDetector class="org.apache.tika.parser.txt.Icu4jEncodingDetector">
            <params>
                <param name="markLimit" type="int">64002</param>
            </params>
        </encodingDetector>
    </encodingDetectors>
</properties>

3. Configuration class

package ;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
/**
 * tika configuration class
 */
@Configuration
public class MyTikaConfig {
    @Autowired
    private ResourceLoader resourceLoader;
    @Bean
    public Tika tika() throws TikaException, IOException, SAXException {
        Resource resource = ("classpath:");
        InputStream inputStream = ();
        TikaConfig config = new TikaConfig(inputStream);
        Detector detector = ();
        Parser autoDetectParser = new AutoDetectParser(config);
        return new Tika(detector, autoDetectParser);
    }
}

4. Controller

package ;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
@RestController
@RequestMapping("/tika")
public class TikaController {
    @Resource
    private Tika tika;
    @PostMapping("/pdf")
    public void TikaDemon(@RequestParam("file") MultipartFile file) throws IOException, TikaException {
        InputStream inputStream = ();
        String s = (inputStream);
        (s);
    }
}

This concludes the article on parsing the content of various document types with Apache Tika. For more on document content parsing with Apache Tika, please search the site's earlier articles or browse the related articles below. Thank you for your support!