①Cos_Main
package NLP_Cos;
import java.io.*;
public class CosMain {
public static void main(String[] args) throws Exception {
//第一步,预处理主要是进行分词和去停用词,分词。
//第二步,列出所有的词。
//公共词
//第三步,计算词频,写出词频向量。
// 执行
Cos_Frame gui = new Cos_Frame();
// 将结果保存到文件out.txt里
File f=new File("out.txt");
Cos_FileOperation.clearInfoForFile("out.txt");
f.createNewFile();
FileOutputStream fileOutputStream = new FileOutputStream(f);
PrintStream printStream = new PrintStream(fileOutputStream);
System.setOut(printStream);
System.out.println("The fact is:");
}
}
②Cos_Alogrithm
package NLP_Cos;
import java.util.*;
public class Cos_Alogrithm {
//数据结构解析:<单词,二维数组>,其中单词表示公共词,
// 二维数组一维度表示句子一的向量,另一维度表示句子二的向量
Map<String, int[]> vectorMap = new HashMap<String, int[]>();
int[] tempArray = null;
public Cos_Alogrithm(String[] string1, String[] string2) {
List<String> list1 = java.util.Arrays.asList(string1);
for (String character1 :list1) {
if (vectorMap.containsKey(character1)) {
vectorMap.get(character1)[0]++;
} else {
tempArray = new int[2];
tempArray[0] = 1;
tempArray[1] = 0;
vectorMap.put(character1, tempArray);
}
}
List<String> list2 = java.util.Arrays.asList(string2);
for (String character2 : list2) {
if (vectorMap.containsKey(character2)) {
vectorMap.get(character2)[1]++;
} else {
tempArray = new int[2];
tempArray[0] = 0;
tempArray[1] = 1;
vectorMap.put(character2, tempArray);
}
}
for (Map.Entry<String, int[]> entry : vectorMap.entrySet()) {
System.out.println("Key = " + entry.getKey() + ", Value = " + entry.getValue()[0] +","+entry.getValue()[1]);
}
}
// 求余弦相似度
public double sim() {
double result = 0;
result = pointMulti(vectorMap) / sqrtMulti(vectorMap);
return result;
}
private double sqrtMulti(Map<String, int[]> vectorMap2) {
double result = 0;
result = squares(vectorMap2);
result = Math.sqrt(result);
return result;
}
// 求平方和
private double squares(Map<String, int[]> vectorMap2) {
double result1 = 0;
double result2 = 0;
Set<String> keySet = vectorMap2.keySet();
for (String character : keySet) {
int temp[] = vectorMap2.get(character);
result1 += (temp[0] * temp[0]);
result2 += (temp[1] * temp[1]);
}
return result1 * result2;
}
// 点乘法
private double pointMulti(Map<String, int[]> vectorMap2) {
double result = 0;
Set<String> keySet = vectorMap2.keySet();
for (String character : keySet) {
int temp[] = vectorMap2.get(character);
result += (temp[0] * temp[1]);
}
return result;
}
}
③Cos_FileOperation
package NLP_Cos;
import java.io.*;
import java.util.Scanner;
public class Cos_FileOperation {
public static String[] filework(String filename) throws IOException {
//1.1文件1去行
java.io.File file1_1 = new java.io.File("actest1_1.txt");
clearInfoForFile("actest1_1.txt");
java.io.PrintWriter output1_1 = new java.io.PrintWriter(file1_1);
BufferedReader br1_1 = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
String str1_1;
while((str1_1=br1_1.readLine())!=null) {
String s1_1 =str1_1;
output1_1.print(s1_1);
//output1.print(" ");
}
output1_1.close();
br1_1.close();
//1.2 文件test1操作
String[] testString1_1;
java.io.File cin1_1 = new java.io.File("actest1_1.txt");
Scanner input1_1 = new Scanner(cin1_1);
String line1_1 = input1_1.nextLine();
line1_1 = line1_1.toLowerCase();
testString1_1 = line1_1.split("[ ]"); //去掉空格
input1_1.close();
//1.3去除停用词
for(int i=0; i<testString1_1.length; i++) {
String temp1_1 = testString1_1[i]; //存放取出的单个单词
char getchar1_1 = temp1_1.toCharArray()[0]; //取出单词首字母,以便于打开对应文件
java.io.File tempfile1_1 = new java.io.File(getchar1_1+"_StopWord.txt");
Scanner tempinput1_1= new Scanner(tempfile1_1);
while(tempinput1_1.hasNext()) {
String templine1 = tempinput1_1.nextLine();
if(templine1.equals(temp1_1)) {
testString1_1[i] = "#";
break;
}
}
tempinput1_1.close();
}
//1.4去标点符号
for(int j=0; j<testString1_1.length; j++) {
char temp[] = (testString1_1[j]).toCharArray();
StringBuilder stringBuilder1_1 = new StringBuilder();
stringBuilder1_1.append(testString1_1[j]);
if(temp[(stringBuilder1_1.length()-1)]== ',' || temp[stringBuilder1_1.length()-1]== '.' || temp[stringBuilder1_1.length()-1]== '(' || temp[stringBuilder1_1.length()-1]== ')' ) {
stringBuilder1_1.deleteCharAt(stringBuilder1_1.length()-1);
testString1_1[j] = stringBuilder1_1.toString();
}
}
//1.5去除标记符号“#”,并将整合好的字符串重新录入文件“actest1_2”中
StringBuilder stringBuilder1_2 = new StringBuilder();
for(int i=0; i<testString1_1.length; i++) {
if(!testString1_1[i].equals("#")) {
stringBuilder1_2.append(testString1_1[i]);
stringBuilder1_2.append(' ');
}
}
String str1_2 = stringBuilder1_2.toString();
java.io.File file1_2 = new java.io.File("actest1_2.txt");
clearInfoForFile("actest1_2");
java.io.PrintWriter output1_2 = new java.io.PrintWriter(file1_2);
output1_2.print(str1_2);
output1_2.close();
//1.6读取文件“actest1_2”,读取出 testString1_2
String[] testString1_2;
java.io.File cin1_2 = new java.io.File("actest1_2.txt");
Scanner input1_2 = new Scanner(cin1_2);
String line1_2 = input1_2.nextLine();
testString1_2 = line1_2.split("[ ]"); //去掉空格
input1_2.close();
return testString1_2;
}
//清空文档中已有内容
public static void clearInfoForFile(String fileName) {
File file =new File(fileName);
try {
if(!file.exists()) {
file.createNewFile();
}
FileWriter fileWriter =new FileWriter(file);
fileWriter.write("");
fileWriter.flush();
fileWriter.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
④Cos_ChooseFile
package NLP_Cos;
import java.io.File;
import java.util.Scanner;
import javax.swing.JFileChooser;
public class Cos_ChooseFile {
public static String readFile() throws Exception {
// TODO Auto-generated method stub
String filename = null;
JFileChooser jfc=new JFileChooser();
if(jfc.showOpenDialog(null)==JFileChooser.APPROVE_OPTION){
File file=jfc.getSelectedFile();
Scanner input=new Scanner(file);
filename = file.getName();
input.close();
//System.out.println("Successfully Opened"+filename);
return filename;
}
else
System.out.println("No file is selected!");
return filename;
}
}
⑤Cos_Frame
package NLP_Cos;
import java.awt.*;
import java.io.File;
import javax.swing.*;
public class Cos_Frame extends JFrame{
String[] str1, str2;
public Cos_Frame() {
//1.创建JFrame
this.setSize(600, 450);
//2.创建布局
GridLayout layout1 = new GridLayout(2, 1);
//3.将布局应用于容器上
this.setLayout(layout1);
//4.创建面板
JPanel p1 = new JPanel();
JPanel p2 = new JPanel();
//5.将面板放到Frame上
this.add(p1);
this.add(p2);
//6.创建组件
JLabel title = new JLabel("计算文本相似度",JLabel.CENTER);
title.setForeground(Color.BLACK);
Font font = new Font("宋体", Font.BOLD, 18);
title.setFont(font);
JButton choose = new JButton("Choose Files");
choose.addActionListener(new doActionListener());
JButton calculate = new JButton("Calculate");
//匿名类
calculate.addActionListener(new ActionListener() {
@Override
public void actionPerformed(ActionEvent e) {
Cos_Alogrithm similarity = new Cos_Alogrithm(str1, str2);
System.out.println(similarity.sim());
dispose();
File f=new File("out.txt");
try {
new Cos_DisplayFact(f);
} catch (Exception e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
}
});
//7.将组件添加到容器上
p1.add(title);
p2.add(choose);
p2.add(calculate);
//8.显示窗口
this.setVisible(true);
this.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
}
//内部类
class doActionListener implements ActionListener{
@Override
public void actionPerformed(ActionEvent e) {
String filename1, filename2;
try {
filename1 = Cos_ChooseFile.readFile();
filename2 = Cos_ChooseFile.readFile();
str1 = Cos_FileOperation.filework(filename1);
str2 = Cos_FileOperation.filework(filename2);
} catch (Exception e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
}
}
}
⑥Cos_DisplayFact
package NLP_Cos;
import java.io.*;
import javax.swing.*;
public class Cos_DisplayFact extends JFrame{
JTextArea textArea = null;
JScrollPane scrollPane = null;
public Cos_DisplayFact(File file) throws Exception {
this.setVisible(true);
this.setSize(800,650);
this.setDefaultCloseOperation(EXIT_ON_CLOSE);
textArea = new JTextArea();
scrollPane = new JScrollPane(textArea);
textArea.setText(null);
BufferedReader buf = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
String str = null;
while( (str = buf.readLine()) != null ){
textArea.append(str);
textArea.append("\r\n");
}
add(scrollPane);
validate();
}
}
本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系我们删除。