#include "stdafx.h"
#include "orsci.h"
#include "orsciDM.h"
using namespace orsci;
using namespace dm;
bool Demo_FileErazeStopWord()
{
cout << " orsci--tool:文本中去除停用词 ..." << endl;
cout << " http://www.orsci.cn" << endl;
cout << "输入:(1)样本文件A:存储分词后文档或样本文档,每行代表一篇文档或一个段落,各词使用空格或TAB键分隔;" << endl;
cout << " (2)文件A左侧保留列数(例如1):注:样本最左侧列可能作为分类组号,例如0,1,2等,所以可以保留。" << endl;
cout << " (3)停用词文件B:存储停用词表,每行为一个停用词!" << endl;
cout << "输出:自动生成文档A加后缀_stopword.txt文档!" << endl;
cout << "说明:允许文件A保留左侧若干列,比如作为分类标记。" << endl;
cout << "文件A格式举例“1 好好 学习 数据 挖掘”,其中1作为分类号。" << endl;
cout << endl;
cout << "[1]请输入样本文件:";
string sampleFileName;
while (sampleFileName == "") getline(cin, sampleFileName);
int mKeepColCount = 0;
cout << "[2]请输入样本文件左侧保留的列数(例如1):";
cin >> mKeepColCount;
cout << "[3]请输入停用词文件:";
string stopwordFileName;
while (stopwordFileName == "") getline(cin, stopwordFileName);
string mDesFileName = sampleFileName + "_stopword.txt";
cout << endl;
if (jw::FileExist(sampleFileName) == false) {cout << "样本文件不存在:" << sampleFileName << endl; return false;}
if (jw::FileExist(stopwordFileName) == false) {cout << "停用词文件不存在:" << stopwordFileName << endl; return false;}
cout << "[Confirm]即将进行停用词去除,目标文件:" << mDesFileName << endl;
cout << "确认继续(输入1--继续,输入0停止!):";
int mmm;
cin >> mmm;
if (mmm != 1) return false;
grid_wstring_Horizon g; //g0用于读取左侧保留列,g1用于读取右侧数据区。
if (mKeepColCount <= 0) //不需要保留信息
{
vwstring mStopList;
mStopList.loadFromTextFile(stopwordFileName, true);
g.loadFromTextFile(sampleFileName, true, true);
g.erazeStopWord(mStopList);
g.saveToTextFile(mDesFileName, L"\t", true, true);
}
else
{
vwstring mStopList;
mStopList.loadFromTextFile(stopwordFileName, true);
g.loadFromTextFile(sampleFileName, true, true);
grid_wstring mKeep;
vint mKeepColIndex = span(0, mKeepColCount - 1);
mKeep = g.subgrid(vint(), mKeepColIndex); //保存需要的列数
g.delete_col(mKeepColIndex); //暂时去除。
g.erazeStopWord(mStopList);
grid_wstring_Horizon desg = g.merge_insert_col(0, mKeep);
desg.saveToTextFile(mDesFileName, L"\t", true, true);
}
cout << "...OK!" << endl;
return true;
}
// Program entry point: runs the stop-word removal demo, then waits for a
// character on standard input so the console window stays open.
// Returns 0 when the demo succeeded and 1 otherwise, so the exit status
// reflects the outcome (it used to be 1 unconditionally). Command-line
// arguments are not used.
int main(int argc, _TCHAR* argv[])
{
	const bool succeeded = Demo_FileErazeStopWord();
	cout << endl;
	cout << "press any key to stop..." << endl;
	char pp;
	cin >> pp; // formatted read: skips whitespace, so a non-blank character is needed
	return succeeded ? 0 : 1;
}
|