Apache tika 實現各種文檔內容解析示例代碼
Apache tika 實現各種文檔內容解析
1、依賴
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>cn.js</groupId>
<artifactId>TikaResouce</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.7.0</version>
</parent>
<!-- Imports the tika-bom POM (version 2.8.0) with scope "import"; the org.apache.tika
     dependencies declared later in this file (tika-core, tika-parsers-standard-package)
     carry no <version> element of their own. -->
<dependencyManagement>
<dependencies>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-bom</artifactId>
<version>2.8.0</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
</dependencyManagement>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- commons-fileupload 1.4 is affected by CVE-2023-24998 (denial of service through an
     unbounded number of request parts); 1.5 is the release that contains the fix. -->
<dependency>
<groupId>commons-fileupload</groupId>
<artifactId>commons-fileupload</artifactId>
<version>1.5</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers-standard-package</artifactId>
</dependency>
</dependencies>
</project>
2、配置文件
新建一個 tika-config.xml 文件
<?xml version="1.0" encoding="UTF-8"?>
<properties>
<!-- Declares three encoding detectors, listed in this order: HtmlEncodingDetector,
     UniversalEncodingDetector and Icu4jEncodingDetector. Each one is given its own
     integer "markLimit" parameter (64000, 64001 and 64002 respectively).
     NOTE(review): markLimit presumably bounds how many bytes a detector may read
     from the stream before resetting it; confirm against the Tika configuration docs. -->
<encodingDetectors>
<encodingDetector class="org.apache.tika.parser.html.HtmlEncodingDetector">
<params>
<param name="markLimit" type="int">64000</param>
</params>
</encodingDetector>
<encodingDetector class="org.apache.tika.parser.txt.UniversalEncodingDetector">
<params>
<param name="markLimit" type="int">64001</param>
</params>
</encodingDetector>
<encodingDetector class="org.apache.tika.parser.txt.Icu4jEncodingDetector">
<params>
<param name="markLimit" type="int">64002</param>
</params>
</encodingDetector>
</encodingDetectors>
</properties>
3、配置類
package cn.js.config;
import java.io.IOException;
import java.io.InputStream;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.core.io.Resource;
import org.springframework.core.io.ResourceLoader;
import org.xml.sax.SAXException;
/**
* tika配置類
*/
@Configuration
public class MyTikaConfig {
@Autowired
private ResourceLoader resourceLoader;
/**
 * Builds the {@link Tika} facade bean from the {@code tika-config.xml} classpath resource.
 *
 * <p>The detector is taken from the loaded {@link TikaConfig}, and the parser is an
 * {@link AutoDetectParser} constructed from the same config.
 *
 * @return a {@code Tika} instance wired with the configured detector and parser
 * @throws TikaException declared by the {@code TikaConfig} constructor
 * @throws IOException if the config resource cannot be opened or read
 * @throws SAXException declared by the {@code TikaConfig} constructor
 */
@Bean
public Tika tika() throws TikaException, IOException, SAXException {
    Resource resource = resourceLoader.getResource("classpath:tika-config.xml");
    // try-with-resources guarantees the config stream is closed once the
    // configuration has been read, including when construction throws.
    try (InputStream inputStream = resource.getInputStream()) {
        TikaConfig config = new TikaConfig(inputStream);
        Detector detector = config.getDetector();
        Parser autoDetectParser = new AutoDetectParser(config);
        return new Tika(detector, autoDetectParser);
    }
}
}
4、controller
package cn.js.controller;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.springframework.http.HttpRequest;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import javax.annotation.Resource;
import java.io.IOException;
import java.io.InputStream;
@RestController
@RequestMapping("/tika")
public class TikaController {
@Resource
private Tika tika;
/**
 * Handles {@code POST /tika/pdf}: extracts the text of the uploaded file with Tika
 * and prints it to standard output. The method itself returns nothing.
 *
 * @param file the multipart upload bound to the {@code file} request parameter
 * @throws IOException if the upload's stream cannot be opened or read
 * @throws TikaException if Tika fails to parse the document
 */
@PostMapping("/pdf")
public void TikaDemon(@RequestParam("file") MultipartFile file) throws IOException, TikaException {
    final String extractedText = tika.parseToString(file.getInputStream());
    System.out.println(extractedText);
}
}
到此這篇關于 Apache tika 實現各種文檔內容解析的文章就介紹到這了,更多相關 Apache tika 文檔內容解析內容請搜索腳本之家以前的文章或繼續瀏覽下面的相關文章,希望大家以后多多支持腳本之家!
相關文章
lanmp(Linux Apache Nginx Mysql Php) 的安裝配置
lanmp(Linux Apache Nginx Mysql Php) 的安裝配置,需要的朋友可以參考下。2010-11-11
阿里云云服務(wù)器Linux系統(tǒng)掛載數(shù)據(jù)盤圖文教程
這篇文章主要介紹了阿里云云服務(wù)器Linux系統(tǒng)掛載數(shù)據(jù)盤圖文教程,阿里云服務(wù)器一般需要購買額外的數(shù)據(jù)盤,本文就講解如何掛載使用額外的數(shù)據(jù)盤,需要的朋友可以參考下2014-09-09
解決navicat連接不上linux服務(wù)器上的mysql問題
這篇文章主要介紹了navicat連接不上linux服務(wù)器上的mysql的解決辦法,非常不錯,具有一定的參考借鑒價值,需要的朋友可以參考下2019-10-10
Linux下設(shè)置防火墻白名單(RHEL 6和CentOS 7)的步驟
下面小編就為大家?guī)硪黄狶inux下設(shè)置防火墻白名單(RHEL 6和CentOS 7)的步驟。小編覺得挺不錯的,現(xiàn)在就分享給大家,也給大家做個參考。一起跟隨小編過來看看吧2016-11-11
戴爾Dell?R630配置raid?安裝centos系統(tǒng)
這篇文章主要介紹了戴爾R630服務(wù)器配置raid后安裝centos7.9的方法,需要的朋友可以參考下2024-03-03

