摘自《机器学习实战》第13章:
import numpy as np


def loadDataSet(fileName, delim='\t'):
    """Load a delimited text file of floats into a NumPy matrix.

    Args:
        fileName: Path of the text file to read.
        delim: Separator between the numbers on a line (TAB by default).

    Returns:
        A ``numpy.matrix`` with one row per non-blank line of the file.

    Raises:
        ValueError: If a field cannot be parsed as a float.
    """
    # ``with`` closes the handle even when parsing fails part-way through.
    with open(fileName) as fr:
        # Build real lists of floats: on Python 3 ``map`` is lazy and would
        # not yield a numeric matrix.  Blank lines (e.g. a trailing newline
        # at the end of the file) carry no sample and are skipped.
        rows = [
            [float(field) for field in line.strip().split(delim)]
            for line in fr
            if line.strip()
        ]
    return np.asmatrix(rows, dtype=float)


def pca(dataMat, topNfeat=999999):
    """Reduce data to its leading principal components.

    Args:
        dataMat: 2-D data with one sample per row and one feature per column.
        topNfeat: Number of leading principal components to keep.  The
            default is large enough to keep every component.

    Returns:
        A tuple ``(lowDDataMat, reconMat)`` of ``numpy.matrix`` objects:
        the mean-centred data projected onto the kept components, and the
        reconstruction of the data in the original feature space.
    """
    dataMat = np.asmatrix(dataMat)
    meanVals = np.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals
    # ``cov`` collapses to a 0-d value for a single feature; keep it 2-D so
    # the eigen-decomposition below always receives a square matrix.
    covMat = np.atleast_2d(np.cov(meanRemoved, rowvar=False))
    # A covariance matrix is symmetric, so ``eigh`` applies; it returns real
    # eigenvalues, which the general-purpose ``eig`` does not guarantee.
    eigVals, eigVects = np.linalg.eigh(covMat)
    # Largest eigenvalue first, truncated to the requested component count.
    eigValInd = np.argsort(eigVals)[::-1][:topNfeat]
    redEigVects = np.asmatrix(eigVects[:, eigValInd])
    # ``*`` on numpy.matrix operands is matrix multiplication.
    lowDDataMat = meanRemoved * redEigVects
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat


def main():
    """Plot ``testSet.txt`` against its one-component PCA reconstruction."""
    # Imported lazily so loadDataSet/pca stay importable on machines that
    # have no plotting backend installed.
    import matplotlib.pyplot as plt

    dataMat = loadDataSet('testSet.txt')
    _, reconMat = pca(dataMat, 1)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Original samples as triangles, reconstructed samples as red circles.
    ax.scatter(np.asarray(dataMat[:, 0]).ravel(),
               np.asarray(dataMat[:, 1]).ravel(),
               marker='^', s=90)
    ax.scatter(np.asarray(reconMat[:, 0]).ravel(),
               np.asarray(reconMat[:, 1]).ravel(),
               marker='o', s=50, c='red')
    plt.show()


if __name__ == "__main__":
    main()
输入数据格式: 文本文件,每行是两个以TAB键分隔的浮点数。
附运行结果: