利用文本挖掘技术来找出网络中的“小鲜词”
发布时间:2021-01-17 22:15:30  所属栏目:大数据  来源:网络整理 
            导读:开始之前,先看一下从人人网中发现的90后用户爱用的词,是不是很好玩,哈哈。写这篇文章就是让你简单地、自动地从文本中找出新的词,这样就知道现在的年轻人喜欢什么了(对于博主这种上了年纪的人来说,真的是很有用,呜呜)。项目结构:当然,text.dat和common.d
                
                
                
            | 
 分词处理,具体看实现:
Chunk.java
package grid.text.participle;
import grid.text.dic.CnDictionary;
import java.util.List;
public class Chunk implements Comparable<Chunk> {
    /** Words making up this candidate segmentation, in text order. */
    private List<String> list;
    /** Total number of characters across all words in {@link #list}. */
    private int len = 0;
    /** Mean word length, i.e. {@code len / list.size()}; stays 0 when there are no words. */
    private double avg = 0;
    /**
     * Spread of the word lengths around {@link #avg}: the square root of the
     * sum of squared deviations (the sum is not divided by the word count).
     */
    private double variance = 0;

    /**
     * Creates a chunk from a candidate sequence of words and pre-computes its
     * length statistics.
     *
     * @param list the words of the chunk; the reference is stored as-is (no copy).
     *             A {@code null} or empty list leaves all statistics at zero.
     */
    public Chunk(List<String> list) {
        this.list = list;
        init();
    }

    /** Computes {@link #len}, {@link #avg} and {@link #variance} from {@link #list}. */
    private void init() {
        // Without this guard an empty list makes avg NaN (0 / 0.0) and a null
        // list throws, although getHead() explicitly tolerates both cases.
        if (null == list || list.isEmpty()) {
            return;
        }
        for (String word : list) {
            len += word.length();
        }
        avg = (double) len / list.size();
        double squaredDeviations = 0;
        for (String word : list) {
            squaredDeviations += Math.pow(avg - word.length(), 2);
        }
        variance = Math.sqrt(squaredDeviations);
    }
    /**
     * @return the total number of characters over all words of this chunk
     */
    public int getLen() {
        return this.len;
    }
    /**
     * @return the average word length of this chunk (total length divided by word count)
     */
    public double getAvg() {
        return this.avg;
    }
    /**
     * @return the spread of the word lengths as computed by {@code init()}: the
     *         square root of the summed squared deviations from the average
     */
    public double getVariance() {
        return this.variance;
    }
    /**
     * Returns the first word of this chunk.
     *
     * @return the word at index 0, or the empty string when the word list is
     *         {@code null} or empty
     */
    public String getHead() {
        final boolean hasWords = list != null && !list.isEmpty();
        return hasWords ? list.get(0) : "";
    }
    /**
     * Compares two doubles in DESCENDING order with a small tolerance.
     *
     * @param d1 left operand
     * @param d2 right operand
     * @return -1 when {@code d1} exceeds {@code d2} by more than the tolerance,
     *         1 when it is smaller by more than the tolerance, 0 otherwise
     *         (including when either operand is NaN)
     */
    private int compareDouble(double d1,double d2) {
        final double tolerance = 0.0000001D;
        final double diff = d1 - d2;
        if (diff > tolerance) {
            return -1;
        }
        if (diff < -tolerance) {
            return 1;
        }
        return 0;
    }
    /**
     * Orders chunks so that the preferred segmentation sorts first (a negative
     * result means this chunk is preferred over {@code o}).
     *
     * Tie-breaks are applied in sequence:
     * <ol>
     *   <li>larger total length;</li>
     *   <li>larger average word length;</li>
     *   <li>smaller variance of word lengths;</li>
     *   <li>larger summed dictionary rate of the single-character words.</li>
     * </ol>
     * NOTE(review): this sequence matches the MMSEG disambiguation rules. The
     * previous code ranked the LARGER variance first in step 3, which
     * contradicts MMSEG rule 3; confirm no caller relied on that ordering.
     *
     * @param o the chunk to compare against; must not be {@code null}
     * @return negative, zero or positive as this chunk ranks before, equal to
     *         or after {@code o}
     */
    @Override
    public int compareTo(Chunk o) {
        // Rule 1: lengths are non-negative, so the subtraction cannot overflow.
        if (len != o.len) {
            return o.len - len;
        }
        // Rule 2: compareDouble sorts descending, so the larger average wins.
        int d = compareDouble(avg,o.avg);
        if (0 != d) {
            return d;
        }
        // Rule 3: operands are swapped on purpose so the SMALLER variance wins.
        d = compareDouble(o.variance,variance);
        if (0 != d) {
            return d;
        }
        // Rule 4: larger summed rate of single-character words wins.
        CnDictionary dictionary = CnDictionary.Instance();
        return compareDouble(singleCharRate(list,dictionary),
                singleCharRate(o.list,dictionary));
    }

    /** Sums {@code dictionary.rate(...)} over the one-character words of {@code words}. */
    private static double singleCharRate(List<String> words,CnDictionary dictionary) {
        double total = 0;
        for (String word : words) {
            if (1 == word.length()) {
                total += dictionary.rate(word.charAt(0));
            }
        }
        return total;
    }
    /**
     * @return the string form of the underlying word list, or {@code "null"}
     *         when the chunk was created with a {@code null} list
     */
    @Override
    public String toString() {
        // String.valueOf avoids a NullPointerException for a null list,
        // which getHead() also tolerates.
        return String.valueOf(list);
    }
}

ChunkStream.java
package grid.text.participle;
import grid.common.Node;
import grid.common.TextUtils;
import grid.common.Tree;
import grid.text.dic.CnDictionary;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class ChunkStream {
    /**
     * Maximum supposed word length: the longest substring that is looked up
     * as a candidate word.
     *
     * You can lower the value if you do not need long words in the result.
     */
    private static final int MAX_WORD_LEN = 7;
    /**
     * Prediction level used while segmenting: the recursion that builds
     * candidate word sequences stops once this depth is reached.
     *
     * Increasing this value brings only a negligible accuracy gain.
     */
    private static final int PREDICT_LEVEL = 3;
    /** Dictionary consulted to decide whether a candidate substring is a word. */
    private static CnDictionary dictionary = CnDictionary.Instance();
    /**
     * Picks the next word of {@code text} starting at {@code off}.
     *
     * All candidate word sequences starting at {@code off} are collected into
     * a tree, each root-to-leaf path is turned into a {@link Chunk}, the
     * chunks are sorted by their natural order, and the first word of the
     * best-ranked chunk is returned.
     *
     * @param text the text being segmented
     * @param off  index in {@code text} where the next word starts
     * @return the chosen word, or the empty string when {@code text} is
     *         {@code null}, {@code off} is out of range, or no candidate exists
     */
    public String next(String text,int off) {
        if (null == text || off < 0 || off >= text.length()) {
            return "";
        }
        Tree<String> root = new Tree<String>("ROOT");
        recurse(root,off,text,0);
        List<Chunk> chunkList = new ArrayList<Chunk>();
        for (Node<String> leaf : root.getLeaves()) {
            chunkList.add(new Chunk(leaf.getBranchPath()));
        }
        // Guard: get(0) on an empty list would throw IndexOutOfBoundsException.
        if (chunkList.isEmpty()) {
            return "";
        }
        Collections.sort(chunkList);
        return chunkList.get(0).getHead();
    }
    /**
     * Expands {@code node} with every candidate word that starts at
     * {@code off}, then recurses behind each candidate.
     *
     * Candidates are tried from the longest ({@link #MAX_WORD_LEN}, clipped to
     * the remaining text) down to one character. A candidate of two or more
     * characters is added when the dictionary contains it; a single character
     * is added only when it is a Chinese letter.
     *
     * @param node        tree node that receives the candidates as children
     * @param off         index in {@code text} where the candidates start
     * @param text        the text being segmented
     * @param predictDeep current recursion depth; expansion stops once it
     *                    reaches {@link #PREDICT_LEVEL}
     */
    private void recurse(Node<String> node,int off,String text,int predictDeep) {
        // The depth never changes inside one invocation, so test it once.
        if (predictDeep >= PREDICT_LEVEL) {
            return;
        }
        final int maxLen = Math.min(MAX_WORD_LEN,text.length() - off);
        for (int len = maxLen; len >= 1; len--) {
            String candidate = text.substring(off,off + len);
            if (len == 1) {
                // Last iteration: a lone character only counts if it is Chinese.
                if (TextUtils.isCnLetter(text.charAt(off))) {
                    recurse(node.add(candidate),off + 1,text,predictDeep + 1);
                }
            } else if (dictionary.contains(candidate)) {
                recurse(node.add(candidate),off + len,text,predictDeep + 1);
            }
        }
    }
}

MechanicalParticiple.java
package grid.text.participle;
import grid.common.TextUtils;
import java.util.Vector;
public class MechanicalParticiple {
    /**
     * Splits a document into tokens.
     *
     * Consecutive English letters and digits are grouped into one token.
     * Runs of Chinese letters are segmented word by word through
     * {@link ChunkStream#next(String, int)}. Any other character ends the
     * pending letter/digit token and is dropped.
     *
     * @param document the text to segment; {@code null} yields an empty result
     * @return the tokens in document order
     */
    public Vector<String> partition(String document) {
        Vector<String> vector = new Vector<String>();
        if (null == document) {
            return vector;
        }
        final int docLen = document.length();
        final ChunkStream stream = new ChunkStream();
        // StringBuilder avoids re-copying the pending run on every character.
        final StringBuilder seg = new StringBuilder();
        int off = 0;
        while (off < docLen) {
            final char c = document.charAt(off);
            if (TextUtils.isEnLetter(c) || TextUtils.isNumeric(c)) {
                seg.append(c);
                off++;
            } else if (TextUtils.isCnLetter(c)) {
                flushSegment(seg,vector);
                String word = stream.next(document,off);
                if (!TextUtils.isBlank(word)) {
                    vector.add(word);
                    off += word.length();
                } else {
                    // No usable word: skip the character, otherwise off would
                    // never advance and the loop would not terminate.
                    off++;
                }
            } else {
                flushSegment(seg,vector);
                /*
                 * TODO: Uncomment the "ELSE IF" clause if you would like to
                 * reserve punctuations
                 */
                // else if (!TextUtils.isBlank("" + c)) { vector.add("" + c); }
                off++;
            }
        }
        flushSegment(seg,vector);
        return vector;
    }

    /** Appends the pending letter/digit run to {@code out} and clears it, if it is not blank. */
    private static void flushSegment(StringBuilder seg,Vector<String> out) {
        String pending = seg.toString();
        if (!TextUtils.isBlank(pending)) {
            out.add(pending);
            seg.setLength(0);
        }
    }
}

selector(编辑:清远站长网) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容! | 


