快捷導航

通過Java實現中文分詞與文本關鍵詞提取

更新時間：2023年06月12日 14:14:09 作者：歐內的手好汗

這篇文章主要為大家詳細介紹了如何利用Java實現中文分詞以及文本關鍵詞提取功能，文中的示例代碼講解詳細，感興趣的小伙伴可以跟隨小編一起學習學習。

1、引入依賴

ik用于分詞，commons-io用來讀取文件內容（我懶）

<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012_u6</version>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.8.0</version>
</dependency>

注意：如果項目使用了ElasticSearch，可能會出現沖突，需根據你的情況手動排除，如下

<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012_u6</version>
    <exclusions>
        <exclusion>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
        </exclusion>
        <exclusion>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
        </exclusion>
        <exclusion>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
        </exclusion>
    </exclusions>
</dependency>

2、創建自己的詞典

創建文件，在里面輸入自己想要擴充的詞語，放到resources中，命名如“keywords.dic”

3、創建分詞工具類

package com.iherb.user.util;
import org.apache.commons.io.IOUtils;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.cfg.DefaultConfig;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import org.wltea.analyzer.dic.Dictionary;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.util.*;
/**
 * Segments Chinese text with IK Analyzer and picks out the words that belong to a
 * user-supplied dictionary ({@code keywords.dic} on the classpath).
 *
 * <p>Constructing an instance loads the custom dictionary, initialises the IK
 * {@link Dictionary} singleton and registers the custom words with it.
 */
public class KeywordUtil {
    /** IK Analyzer configuration handed to every segmenter this instance creates. */
    Configuration cfg;

    /** Custom dictionary words, in file order, as loaded by the constructor. */
    List<String> expandWords = new ArrayList<>();

    /**
     * Hash-based copy of {@link #expandWords}, filled at load time, so membership
     * checks in {@link #getKeywords} do not scan the list for every candidate word.
     */
    private final Set<String> expandWordSet = new HashSet<>();

    /** Minimum length (in chars) a segmented word must have to be counted. */
    private static final int MIN_LEN = 2;

    /** Classpath resource that holds the custom dictionary, one word per line. */
    private static final String DEFAULT_DICTIONARY = "keywords.dic";

    /**
     * Builds the IK configuration, loads the custom dictionary and registers its
     * words with the IK dictionary singleton.
     *
     * @throws IllegalStateException if the dictionary resource is missing or unreadable
     */
    KeywordUtil() {
        cfg = DefaultConfig.getInstance();
        // true = smart (coarse-grained) segmentation, false = fine-grained segmentation
        cfg.setUseSmart(true);
        loadDictionaries(DEFAULT_DICTIONARY);
        Dictionary.initial(cfg);
        // Register the custom words so the segmenter emits them as single lexemes.
        Dictionary.getSingleton().addWords(expandWords);
    }

    /**
     * Reads the given classpath resources as UTF-8, one word per line, and appends the
     * words to the custom dictionary. Lines are trimmed; blank lines are skipped.
     *
     * @param filenames classpath-relative resource names
     * @throws IllegalStateException if a resource does not exist or cannot be read;
     *         the underlying {@link IOException}, if any, is kept as the cause
     */
    private void loadDictionaries(String... filenames) {
        ClassLoader loader = KeywordUtil.class.getClassLoader();
        for (String filename : filenames) {
            // try-with-resources: the stream was previously never closed.
            try (InputStream in = loader.getResourceAsStream(filename)) {
                if (in == null) {
                    // getResourceAsStream signals "not found" with null, not an exception.
                    throw new IllegalStateException("讀取失敗: " + filename + " (resource not found)");
                }
                for (String line : IOUtils.readLines(in, StandardCharsets.UTF_8)) {
                    // Drop a leading UTF-8 byte-order mark; trim() does not remove it.
                    if (line.startsWith("\uFEFF")) {
                        line = line.substring(1);
                    }
                    String word = line.trim();
                    if (!word.isEmpty()) {
                        expandWords.add(word);
                        expandWordSet.add(word);
                    }
                }
            } catch (IOException e) {
                throw new IllegalStateException("讀取失敗: " + filename, e);
            }
        }
    }

    /**
     * Segments {@code text} and returns the distinct words of at least
     * {@value #MIN_LEN} chars, most frequent first. Words with equal frequency keep
     * the order of their first appearance in the text.
     *
     * @param text the text to segment; {@code null} or empty yields an empty list
     * @return a new mutable list of distinct words ordered by descending frequency
     * @throws UncheckedIOException if the segmenter fails while reading the text
     */
    public List<String> extract(String text) {
        if (text == null || text.isEmpty()) {
            return new ArrayList<>();
        }
        IKSegmenter segmenter = new IKSegmenter(new StringReader(text), cfg);
        // LinkedHashMap keeps first-seen order, which the stable sort below preserves
        // for ties, so the result is deterministic.
        Map<String, Integer> counts = new LinkedHashMap<>();
        try {
            Lexeme lexeme;
            while ((lexeme = segmenter.next()) != null) {
                String word = lexeme.getLexemeText();
                if (word.length() >= MIN_LEN) {
                    counts.merge(word, 1, Integer::sum);
                }
            }
        } catch (IOException e) {
            // The source is an in-memory reader, so this is not expected in practice;
            // surface it instead of pretending the text had no words.
            throw new UncheckedIOException("Failed to segment text", e);
        }
        List<String> result = new ArrayList<>(counts.keySet());
        result.sort((w1, w2) -> Integer.compare(counts.get(w2), counts.get(w1)));
        return result;
    }

    /**
     * Returns the most frequent words of {@code text} that are present in the custom
     * dictionary.
     *
     * @param text the text to analyse; {@code null} or empty yields an empty list
     * @param num  maximum number of words to return; {@code null} means no limit,
     *             zero or a negative value yields an empty list
     * @return a new mutable list of at most {@code num} dictionary words, most
     *         frequent first
     */
    public List<String> getKeywords(String text, Integer num) {
        List<String> result = new ArrayList<>();
        if (num != null && num <= 0) {
            return result;
        }
        for (String word : extract(text)) {
            if (expandWordSet.contains(word)) {
                result.add(word);
                if (num != null && result.size() >= num) {
                    break;
                }
            }
        }
        return result;
    }

    /** Small demo: prints up to five dictionary keywords found in a sample sentence. */
    public static void main(String[] args) {
        String text = "哈哈無花果翠云草酢漿草是什么，。我是帥哥666無花果真好吃還有北沙參穿心蓮翠云草，草豆蔻和蟬蛻酢漿草也不錯的";
        KeywordUtil keywordUtil = new KeywordUtil();
        List<String> keywords = keywordUtil.getKeywords(text, 5);
        keywords.forEach(System.out::println);
    }
}