机器学习第五章代码.docx (Machine Learning, Chapter 5 — Logistic Regression code)
# 5.1 Logistic regression: gradient ascent optimization.
# Each data row holds the point's X1, X2 coordinates; the last column is its class label.
from numpy import *


def loadDataSet():
    """Load the training set from 'testSet.txt'.

    Returns:
        dataMat:  list of [1.0, X1, X2] rows. The leading 1.0 is the bias
                  feature, so the hypothesis H(x) = W0 + W1*X1 + W2*X2 is
                  simply (W0, W1, W2) . (1, X1, X2).
        labelMat: list of int class labels (one per row).
    """
    dataMat = []
    labelMat = []
    # 'with' guarantees the file is closed even if parsing raises.
    with open('testSet.txt') as fr:
        for line in fr:  # read line by line
            lineArr = line.strip().split()
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat


def sigmoid(inX):
    """Compute the sigmoid function 1 / (1 + e^-x) (elementwise for arrays)."""
    return 1.0 / (1 + exp(-inX))


def gradAscent(dataMatIn, classLabels):
    """Find the best regression weights by batch gradient ascent.

    Args:
        dataMatIn:   2-D structure of training rows [1.0, X1, X2].
        classLabels: sequence of 0/1 class labels.

    Returns:
        (n, 1) NumPy matrix of regression weights.
    """
    dataMatrix = mat(dataMatIn)                   # convert to NumPy matrix
    labelMat = mat(classLabels).transpose()       # column vector of labels
    m, n = shape(dataMatrix)
    alpha = 0.001                                 # step size
    maxCycles = 500                               # number of iterations
    weights = ones((n, 1))                        # weights initialized to 1
    # Each cycle moves the weights along the gradient toward labelMat.
    for k in range(maxCycles):
        h = sigmoid(dataMatrix * weights)         # matrix multiply: predictions
        error = labelMat - h                      # vector of residuals
        # dataMatrix.T * error is the gradient of the log-likelihood.
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights


# 5.2 Plot the data set and the Logistic-regression best-fit line.
def plotBestFit(weights):
    """Scatter the training points and draw the decision boundary.

    Args:
        weights: regression coefficients (W0, W1, W2), indexable by position.
    """
    import matplotlib.pyplot as plt
    # Plot the points.
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []
    ycord1 = []
    xcord2 = []
    ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    # Plot the boundary: x in [-3.0, 3.0) with step 0.1.
    x = arange(-3.0, 3.0, 0.1)
    # Boundary is where W0 + W1*X1 + W2*X2 = 0, solved for X2.
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    # Display the figure.
    plt.show()


# 5.3 Stochastic gradient ascent.
def stocGradAscent0(dataMatrix, classLabels):
    """One pass of stochastic gradient ascent over the rows, in order.

    Args:
        dataMatrix:  NumPy array of training rows [1.0, X1, X2].
        classLabels: sequence of 0/1 class labels.

    Returns:
        1-D NumPy array of n regression weights.
    """
    m, n = shape(dataMatrix)
    alpha = 0.01
    weights = ones(n)  # weights initialized to 1
    for i in range(m):
        # h and error are scalars here: one sample per update.
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights
# 5.4 Improved stochastic gradient ascent.
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Stochastic gradient ascent with a decaying step size and random,
    without-replacement sample order.

    Args:
        dataMatrix:  NumPy array of training rows [1.0, X1, X2].
        classLabels: sequence of 0/1 class labels.
        numIter:     number of full passes over the data (default 150).

    Returns:
        1-D NumPy array of n regression weights.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)  # weights initialized to 1
    for j in range(numIter):
        # list() so entries can be deleted as samples are consumed
        # (range objects do not support del in Python 3).
        dataIndex = list(range(m))
        for i in range(m):
            # alpha decays every update, which damps the oscillations of
            # plain stochastic gradient ascent; the +0.0001 floor keeps it
            # from ever reaching zero.
            alpha = 4 / (1.0 + j + i) + 0.0001
            # Pick a remaining sample at random to break up the periodic
            # fluctuations caused by a fixed visiting order.
            randIndex = int(random.uniform(0, len(dataIndex)))
            # Look up the actual row through dataIndex so each row is used
            # exactly once per pass (the book's code indexed dataMatrix
            # with randIndex directly, which resamples with replacement).
            sample = dataIndex[randIndex]
            h = sigmoid(sum(dataMatrix[sample] * weights))
            error = classLabels[sample] - h
            weights = weights + alpha * error * dataMatrix[sample]
            del dataIndex[randIndex]  # consume this sample for the pass
    return weights