Python - Forests (Random Forest, Extremely Randomized Trees, and Rotation Forest)
When the input data contains non-linear relationships, models based on linear regression break down, whereas tree-based algorithms are unaffected by non-linearity in the data. The biggest headache with tree-based methods is pruning the tree to avoid overfitting: large trees tend to fit the noise lurking in the data, ending up with low bias but high variance (overfitting). However, if we grow a large number of trees and take the average of the outputs of all the trees in the ensemble as the final prediction, the variance problem can be avoided (see the short sketch after the list below).
1. Random forest: an ensemble technique that models with a large number of trees, with the constraint that the trees must not be correlated with each other; instead of offering all attributes, a random subset of the attributes is given to each tree. Because we grow the trees in a random forest to maximum depth, they fit their bootstrapped samples very well and come out with low bias, at the cost of introducing high variance; but by building a large number of trees and using the law of averages for the final prediction, the variance problem is resolved.
2. Extremely randomized trees (extra trees): introduce even more randomization than random forests, handle the variance problem more effectively, and have slightly lower computational complexity. Where a random forest bootstraps instances to feed each tree, extra trees use the complete training set. Also, given K as the number of attributes randomly selected at a given node, extra trees pick the cut point at random, without considering the target variable, rather than basing it on a Gini impurity or entropy criterion the way a random forest does. This extra randomization gives the architecture a better shot at reducing variance; and since splitting a node requires no criterion, no time is spent identifying the attribute best suited to partition the dataset.
3. Rotation forest: the two methods above need an ensemble of a large number of trees to perform well, whereas a rotation forest can achieve the same or even better results with fewer trees. The algorithm works as a voting scheme: the attributes are divided into K non-overlapping subsets of equal size, and PCA-derived rotation matrices are then used to build the model.
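Before the individual recipes, a quick illustration of the variance argument from above. This is a minimal sketch, not part of the original recipes: it uses sklearn's BaggingClassifier purely to make the averaging point, and names such as single_tree and bagged_trees are illustrative only.
# Sketch: averaging many deep trees reduces variance (not part of the recipes below)
from sklearn.datasets import make_classification
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

x,y = make_classification(n_samples=500,n_features=30,flip_y=0.03,random_state=7)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=9)

# One fully grown tree: low bias on the training data, high variance on unseen data
single_tree = DecisionTreeClassifier(random_state=9)
single_tree.fit(x_train,y_train)
print("single tree test accuracy = %0.2f" % single_tree.score(x_test,y_test))

# Averaging 100 deep trees built on bootstrapped samples smooths the variance out
bagged_trees = BaggingClassifier(DecisionTreeClassifier(),n_estimators=100,random_state=9)
bagged_trees.fit(x_train,y_train)
print("bagged trees test accuracy = %0.2f" % bagged_trees.score(x_test,y_test))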
Random forest:
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 11 09:57:49 2018
@author: Alvin AI
"""
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import RandomizedSearchCV
from operator import itemgetter
import numpy as np
# Load the data
def get_data():
    no_features = 30
    redundant_features = int(0.1*no_features)
    informative_features = int(0.6*no_features)
    repeated_features = int(0.1*no_features)
    x,y = make_classification(n_samples=500,n_features=no_features,\
            flip_y=0.03,n_informative=informative_features,\
            n_redundant=redundant_features,\
            n_repeated=repeated_features,random_state=7)
    return x,y
# Build a plain random forest
def build_model(x,y,x_dev,y_dev):
    no_trees = 100
    estimator = RandomForestClassifier(n_estimators=no_trees)
    estimator.fit(x,y)
    train_predicted = estimator.predict(x)
    train_score = accuracy_score(y,train_predicted)
    dev_predicted = estimator.predict(x_dev)
    dev_score = accuracy_score(y_dev,dev_predicted)
    print "training accuracy = %0.2f dev accuracy = %0.2f" \
            % (train_score,dev_score)
# Search for the optimal parameters of the forest
def search_parameters(x,y,x_dev,y_dev):
    estimator = RandomForestClassifier()
    no_features = x.shape[1]
    no_iterations = 20  # number of parameter settings sampled by the random search
    sqr_no_features = int(np.sqrt(no_features))
    # criterion: chosen between gini and entropy, used to split nodes in each iteration
    # max_features: the number m of attributes randomly selected at each node split
    # there is also a splitter parameter (default "best"), the internal split mechanism
    # applied over the attributes defined by max_features:
    # best: pick the best possible split among those attributes; random: pick a split attribute at random
    parameters = {"n_estimators":np.random.randint(75,200,no_iterations),\
            "criterion":["gini","entropy"],\
            "max_features":[sqr_no_features,sqr_no_features*2,\
                sqr_no_features*3,sqr_no_features+10]}
    # n_jobs: number of estimators evaluated in parallel; -1 uses all CPUs, 1 means no parallelism
    # 5-fold cross validation with 20 iterations, so 100 models are built in total
    grid = RandomizedSearchCV(estimator=estimator,\
            param_distributions=parameters,\
            verbose=1,\
            n_iter=no_iterations,\
            random_state=77,\
            n_jobs=-1,\
            cv=5)
    #print grid
    grid.fit(x,y)
    print_model_worth(grid,x_dev,y_dev)
    return grid.best_estimator_,grid.grid_scores_
# Print the model's evaluation metrics
def print_model_worth(grid,x_dev,y_dev):
    # grid_scores_: cross validation scores for every parameter combination tried
    # key: which field to sort on; itemgetter(1) fetches element 1 (the mean validation score); reverse=True sorts descending
    scores = sorted(grid.grid_scores_,key=itemgetter(1),reverse=True)[0:5]  # look at only the 5 best models
    for model_no,score in enumerate(scores):
        print "model %d, score = %0.3f" % (model_no+1,score.mean_validation_score)
        print "parameters = {0}".format(score.parameters)
        print
    dev_predicted = grid.predict(x_dev)
    print classification_report(y_dev,dev_predicted)
# Print the most important features
def get_feature_importance(model):
    feature_importance = model.feature_importances_
    fm_with_id = [(i,importance) for i,importance \
            in enumerate(feature_importance)]
    fm_with_id = sorted(fm_with_id,key=itemgetter(1),reverse=True)[0:10]
    print "Top 10 Features"
    for importance in fm_with_id:
        print "feature %d importance = %0.3f" % (importance[0],importance[1])
# Main
if __name__=="__main__":
    x,y = get_data()
    # Split the dataset
    x_train,x_test_all,y_train,y_test_all = train_test_split(x,y,\
            test_size=0.3,random_state=9)
    x_dev,x_test,y_dev,y_test = train_test_split(x_test_all,y_test_all,\
            test_size=0.3,random_state=9)
    # Build the models
    build_model(x_train,y_train,x_dev,y_dev)
    model,score_all = search_parameters(x,y,x_dev,y_dev)
    get_feature_importance(model)
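As a follow-up (a sketch, not part of the original listing): the tuned estimator returned by search_parameters can also be scored on the untouched test split, assuming model, x_test, and y_test from the main block above are still in scope.
# Sketch: evaluate the tuned forest on the held-out test split
test_predicted = model.predict(x_test)
print("test accuracy = %0.2f" % accuracy_score(y_test,test_predicted))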
Extremely randomized trees (extra trees):
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 11 14:46:59 2018
@author: Alvin AI
"""
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report,accuracy_score
from sklearn.cross_validation import train_test_split,cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.grid_search import RandomizedSearchCV
from operator import itemgetter
import numpy as np
def get_data():
    no_features = 30
    redundant_features = int(0.1*no_features)
    informative_features = int(0.6*no_features)
    repeated_features = int(0.1*no_features)
    x,y = make_classification(n_samples=500,n_features=no_features,\
            flip_y=0.3,n_informative=informative_features,\
            n_repeated=repeated_features,\
            n_redundant=redundant_features,\
            random_state=7)
    return x,y
def build_forest(x,y,x_dev,y_dev):
    no_trees = 100
    estimator = ExtraTreesClassifier(n_estimators=no_trees,random_state=51)
    estimator.fit(x,y)
    train_predicted = estimator.predict(x)
    train_score = accuracy_score(y,train_predicted)
    dev_predicted = estimator.predict(x_dev)
    dev_score = accuracy_score(y_dev,dev_predicted)
    print "training accuracy = %0.2f\n dev accuracy = %0.2f" \
            % (train_score,dev_score)
    print "cross validation"
    print cross_val_score(estimator,x_dev,y_dev,cv=5)
def print_model_worth(grid,x_dev,y_dev):
    scores = sorted(grid.grid_scores_,key=itemgetter(1),reverse=True)[0:5]  # look at only the 5 best models
    for model_no,score in enumerate(scores):
        print "model %d, score = %0.3f" % (model_no+1,score.mean_validation_score)
        print "parameters = {0}".format(score.parameters)
        print
    dev_predicted = grid.predict(x_dev)
    print classification_report(y_dev,dev_predicted)
def search_parameters(x,y,x_dev,y_dev):
    estimator = ExtraTreesClassifier()
    no_features = x.shape[1]
    no_iterations = 20
    sqr_no_features = int(np.sqrt(no_features))
    parameters = {"n_estimators":np.random.randint(75,200,no_iterations),\
            "criterion":["gini","entropy"],\
            "max_features":[sqr_no_features,sqr_no_features*2,\
                sqr_no_features*3,sqr_no_features+10]}
    grid = RandomizedSearchCV(estimator=estimator,\
            param_distributions=parameters,\
            verbose=1,\
            n_iter=no_iterations,\
            random_state=77,\
            n_jobs=-1,\
            cv=5)
    grid.fit(x,y)
    print_model_worth(grid,x_dev,y_dev)
    return grid.best_estimator_
if __name__=="__main__":
    x,y = get_data()
    x_train,x_test_all,y_train,y_test_all = train_test_split(x,y,test_size=0.3,\
            random_state=9)
    x_dev,x_test,y_dev,y_test = train_test_split(x_test_all,y_test_all,\
            test_size=0.3,random_state=9)
    build_forest(x_train,y_train,x_dev,y_dev)
    model = search_parameters(x,y,x_dev,y_dev)
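The claim that extra trees save split-selection time can be sanity-checked with a rough wall-clock comparison. A minimal sketch, reusing get_data() from the listing above; the numbers are illustrative, not a benchmark.
# Sketch: compare fit times of a random forest and extra trees on the same data
import time
from sklearn.ensemble import RandomForestClassifier

x,y = get_data()

start = time.time()
RandomForestClassifier(n_estimators=100).fit(x,y)
print("random forest fit time = %0.3f s" % (time.time()-start))

start = time.time()
ExtraTreesClassifier(n_estimators=100).fit(x,y)
print("extra trees fit time = %0.3f s" % (time.time()-start))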
Rotation forest:
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 11 17:01:22 2018
@author: Alvin AI
"""
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
import numpy as np
# Load the data
def get_data():
    no_features = 50
    redundant_features = int(0.1*no_features)
    informative_features = int(0.6*no_features)
    repeated_features = int(0.1*no_features)
    x,y = make_classification(n_samples=500,n_features=no_features,\
            flip_y=0.03,n_informative=informative_features,\
            n_redundant=redundant_features,\
            n_repeated=repeated_features,random_state=7)
    return x,y
# Get random subsets of the features
def get_random_subset(iterable,k):
    subsets = []
    iteration = 0
    np.random.shuffle(iterable)  # shuffle the feature indices
    subset = 0
    limit = len(iterable)/k
    while iteration < limit:
        if k <= len(iterable):
            subset = k
        else:
            subset = len(iterable)
        subsets.append(iterable[-subset:])
        del iterable[-subset:]
        iteration += 1
    return subsets
# Build the rotation forest model
def build_rotationtree_model(x_train,y_train,d,k):
    models = []  # the decision trees
    r_matrices = []  # the rotation matrix associated with each tree
    feature_subsets = []  # feature subsets used in each iteration
    for i in range(d):
        x,_,_,_ = train_test_split(x_train,y_train,test_size=0.3,random_state=7)
        # indices of the features
        feature_index = range(x.shape[1])
        # get subsets of the features:
        # 10 subsets, each holding 5 indices
        random_k_subset = get_random_subset(feature_index,k)
        feature_subsets.append(random_k_subset)  # 25 trees, 10 subsets per tree
        R_matrix = np.zeros((x.shape[1],x.shape[1]),dtype=float)  # the rotation matrix
        for each_subset in random_k_subset:
            pca = PCA()
            x_subset = x[:,each_subset]  # pull out the x values for the indices in this subset
            pca.fit(x_subset)  # principal component analysis
            for ii in range(0,len(pca.components_)):
                for jj in range(0,len(pca.components_)):
                    R_matrix[each_subset[ii],each_subset[jj]] = \
                            pca.components_[ii,jj]
        x_transformed = x_train.dot(R_matrix)
        model = DecisionTreeClassifier()
        model.fit(x_transformed,y_train)
        models.append(model)
        r_matrices.append(R_matrix)
    return models,r_matrices,feature_subsets
def model_worth(models,r_matrices,x,y):
    predicted_ys = []
    for i,model in enumerate(models):
        x_mod = x.dot(r_matrices[i])
        predicted_y = model.predict(x_mod)
        predicted_ys.append(predicted_y)
    predicted_matrix = np.asmatrix(predicted_ys)  # convert to a matrix, 25 x 350
    final_prediction = []
    for i in range(len(y)):
        pred_from_all_models = np.ravel(predicted_matrix[:,i])  # flatten the multi-dimensional array to one dimension
        non_zero_pred = np.nonzero(pred_from_all_models)[0]  # np.nonzero(a) returns the indices of the non-zero elements of a
        is_one = len(non_zero_pred) > len(models)/2  # predict 1 if more than half the trees in the model predicted non-zero
        final_prediction.append(is_one)
    print classification_report(y,final_prediction)
    return predicted_matrix
# Main
if __name__=="__main__":
    x,y = get_data()
    # Split the dataset
    x_train,x_test_all,y_train,y_test_all = train_test_split(x,y,\
            test_size=0.3,random_state=9)
    x_dev,x_test,y_dev,y_test = train_test_split(x_test_all,y_test_all,\
            test_size=0.3,random_state=9)
    models,r_matrices,features = build_rotationtree_model(x_train,y_train,25,5)  # 25 trees, feature subsets of size 5
    predicted_matrix1 = model_worth(models,r_matrices,x_train,y_train)
    predicted_matrix2 = model_worth(models,r_matrices,x_dev,y_dev)
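Since model_worth votes the ensemble and prints a classification report, the held-out test split can be scored the same way once the trees are built. A one-line sketch, assuming the variables from the main block above:
# Sketch: score the rotation forest on the untouched test split
predicted_matrix3 = model_worth(models,r_matrices,x_test,y_test)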