E035

在文本挖掘中,常用潜在语义分析(LSI)技术,其主要思想是基于奇异值分解(SVD)进行降维。下面使用orsci-dm演示如何进行降维操作。

 

#include "stdafx.h"

#include "orsci.h"

using namespace orsci;
using namespace orsci::vmt;



/// Demo entry point: runs an SVD-based principal component analysis on a
/// hard-coded 30x4 sample matrix, prints the resulting matrices/vectors, and
/// then maps the same data through the fitted mean and coefficients.
///
/// @param argc Unused.
/// @param argv Unused.
/// @return 0 when the decomposition reports success, 1 when it reports failure.
int main(int argc, _TCHAR* argv[])
{
    cout << " orsci: PCA -- based SVD " << endl;
    cout << endl;

    cout << " orsci:Principal component analysis (PCA)" << endl
         << " --- http://www.orsci.cn" << endl;
    cout << "Part1:进行主成分分析系数矩阵计算!" << endl;

    // Sample matrix: one observation per row, one variable per column.
    mdouble X;
    //randn(13, 5, X);
    X = "148 41 72 78;"
        "139 34 71 76;"
        "160 49 77 86;"
        "149 36 67 79;"
        "159 45 80 86;"
        "142 31 66 76;"
        "153 43 76 83;"
        "150 43 77 79;"
        "151 42 77 80;"
        "139 31 68 74;"
        "140 29 64 74;"
        "161 47 78 84;"
        "158 49 78 83;"
        "140 33 67 77;"
        "137 31 66 73;"
        "152 35 73 79;"
        "149 47 82 79;"
        "145 35 70 77;"
        "160 47 74 87;"
        "156 44 78 85;"
        "151 42 73 82;"
        "147 38 73 78;"
        "157 39 68 80;"
        "147 30 65 75;"
        "157 48 80 88;"
        "151 36 74 80;"
        "144 36 68 76;"
        "141 30 67 76;"
        "139 32 68 73;"
        "148 38 70 78;";

    cout << "X = " << endl;
    cout << X << endl;
    cout << "sample count = " << X.rowCount() << endl;

    // Output containers filled by the PCA call below.
    mdouble coeff_out, score_out;
    coldouble latent_out;
    coldouble tsquared_out;
    rowdouble mu;

    // Flip to true to exercise the weighted variant instead of the plain one.
    const bool useWeightedVariant = false;

    // The trailing argument 2 is presumably the number of components to keep
    // (the documented output has 2 columns) -- confirm against the orsci docs.
    bool mFlag = false;
    if (!useWeightedVariant)
    {
        mFlag = princomp_svd(X, coeff_out, score_out, latent_out, tsquared_out, mu, 2);
    }
    else
    {
        // Simulated per-sample weights; per the original note they increase
        // in order, and are then scaled by their maximum.
        coldouble sampleWeight;
        sampleWeight.Resize(X.rowCount());
        sampleWeight.fill(1, 1);
        sampleWeight /= sampleWeight.maxv();

        // Simulated per-variable weights, built the same way.
        rowdouble variableWeight;
        variableWeight.Resize(X.colCount());
        variableWeight.fill(1, 1);
        variableWeight /= variableWeight.maxv();

        mFlag = princomp_svd_Weighted(X, sampleWeight, variableWeight, coeff_out, score_out, latent_out, tsquared_out, mu, 2);
    }
    cout << "成功标记:" << mFlag << endl;

    if (mFlag)
    {
        cout << "score=" << endl;
        cout << score_out << endl;
        cout << "coeff=" << endl;
        cout << coeff_out << endl;

        cout << "latent=" << endl;
        cout << latent_out << endl;
        cout << "tsquared=" << endl;
        cout << tsquared_out << endl;

        cout << "Part2:对新数据进行映射计算!" << endl;
        cout << princomp_svd_test(X, mu, coeff_out) << endl;
    }
    else
    {
        // The outputs are not meaningful when the decomposition reports
        // failure, so do not print them or feed them into the projection.
        cout << "PCA failed; no results to display." << endl;
    }

    cout << endl;
    cout << "press any key to stop..." << endl;
    // Keep the console window open until the user types something.
    char pp;
    cin >> pp;
    return mFlag ? 0 : 1;
}


输出

(一)运行过程

orsci: PCA -- based SVD

orsci:Principal component analysis (PCA)
--- http://www.orsci.cn
Part1:进行主成分分析系数矩阵计算!
X =
rowCount = 30 colCount = 4
148 41 72 78
139 34 71 76
160 49 77 86
149 36 67 79
159 45 80 86
142 31 66 76
153 43 76 83
150 43 77 79
151 42 77 80
139 31 68 74
140 29 64 74
161 47 78 84
158 49 78 83
140 33 67 77
137 31 66 73
152 35 73 79
149 47 82 79
145 35 70 77
160 47 74 87
156 44 78 85
151 42 73 82
147 38 73 78
157 39 68 80
147 30 65 75
157 48 80 88
151 36 74 80
144 36 68 76
141 30 67 76
139 32 68 73
148 38 70 78

sample count = 30
7.7565
4.21157

成功标记:1
score=
rowCount = 30 colCount = 2
-0.0718789 -1.51331
10.5913 -4.57546
-16.9727 1.4941
3.77956 4.32881
-15.3369 0.249682
12.4385 1.70049
-7.75454 -0.788582
-4.84214 -4.0498
-5.26914 -2.89258
14.2182 -1.88916
16.3459 2.08956
-16.1623 1.83844
-15.0465 -0.955539
11.7977 -0.776369
16.6451 -2.02536
0.0166779 2.64832
-8.49655 -9.38027
6.33417 -0.221238
-14.9915 4.3328
-11.7268 -0.186444
-4.36014 0.081326
1.82138 -1.78243
-3.6607 7.96195
10.6481 5.76841
-16.4908 -1.74634
-0.688991 1.16266
7.57784 -0.0575147
13.2134 0.740099
14.0212 -2.40082
2.42236 0.844548

coeff=
rowCount = 4 colCount = 2
-0.624023 0.645564
-0.559191 -0.345639
-0.408334 -0.66047
-0.362166 0.166013

latent=
60.1632
17.7373

tsquared=
0.129198
3.0448
4.91403
1.29389
3.91322
2.73464
1.03456
1.31436
0.933193
3.56137
4.68724
4.53242
3.81451
2.34744
4.83641
0.39542
6.16062
0.669641
4.79397
2.28771
0.31636
0.234257
3.79671
3.76053
4.69206
0.0841019
0.95465
2.93288
3.59264
0.137745

Part2:对新数据进行映射计算!
rowCount = 30 colCount = 2
-0.0718789 -1.51331
10.5913 -4.57546
-16.9727 1.4941
3.77956 4.32881
-15.3369 0.249682
12.4385 1.70049
-7.75454 -0.788582
-4.84214 -4.0498
-5.26914 -2.89258
14.2182 -1.88916
16.3459 2.08956
-16.1623 1.83844
-15.0465 -0.955539
11.7977 -0.776369
16.6451 -2.02536
0.0166779 2.64832
-8.49655 -9.38027
6.33417 -0.221238
-14.9915 4.3328
-11.7268 -0.186444
-4.36014 0.081326
1.82138 -1.78243
-3.6607 7.96195
10.6481 5.76841
-16.4908 -1.74634
-0.688991 1.16266
7.57784 -0.0575147
13.2134 0.740099
14.0212 -2.40082
2.42236 0.844548

press any key to stop...

(二)二维空间图


(三)说明:

(1)在文本挖掘中,经常使用潜在语义分析LSI模型,进行去噪或降维。

(2)这里演示如何实现降维。

(3)orsci-dm包支持数据分析和数据挖掘计算,可下载配套的orsci软件使用。

书籍 姜维. 《数据分析与数据挖掘》、《文本分析与文本挖掘》、《数据分析与数据挖掘建模与工具》。
软件 orsci-dm开发包(C++语言、Delphi语言和C语言)。