Python - Forests (Random Forest, Extremely Randomized Trees, and Rotation Forest)
When the input data contains non-linear relationships, models based on linear regression break down, whereas tree-based algorithms are unaffected by non-linearity in the data. The biggest headache with tree-based methods is pruning the tree to avoid overfitting: large trees tend to fit the noise lurking in the data, ending up with low bias but high variance (overfitting). However, if we grow a large number of trees and take the average of the outputs of all the trees in the ensemble as the final prediction, the variance problem can be avoided (see the short sketch after the list below).
1. Random forest: an ensemble technique that models with a large number of trees, with the constraint that the trees must not be correlated with each other; instead of offering all attributes, a random subset of the attributes is given to each tree. Because we grow the trees in a random forest to maximum depth, they fit their bootstrapped samples very well and come out with low bias, at the cost of introducing high variance; but by building a large number of trees and using the law of averages for the final prediction, the variance problem is resolved.
2. Extremely randomized trees (extra trees): introduce even more randomization than random forests, handle the variance problem more effectively, and have slightly lower computational complexity. Where a random forest bootstraps instances to feed each tree, extra trees use the complete training set. Also, given K as the number of attributes randomly selected at a given node, extra trees pick the cut point at random, without considering the target variable, rather than basing it on a Gini impurity or entropy criterion the way a random forest does. This extra randomization gives the architecture a better shot at reducing variance; and since splitting a node requires no criterion, no time is spent identifying the attribute best suited to partition the dataset.
3. Rotation forest: the two methods above need an ensemble of a large number of trees to perform well, whereas a rotation forest can achieve the same or even better results with fewer trees. The algorithm works as a voting scheme: the attributes are divided into K non-overlapping subsets of equal size, and PCA-derived rotation matrices are then used to build the model.
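Before the individual recipes, a quick illustration of the variance argument from above. This is a minimal sketch, not part of the original recipes: it uses sklearn's BaggingClassifier purely to make the averaging point, and names such as single_tree and bagged_trees are illustrative only.
# Sketch: averaging many deep trees reduces variance (not part of the recipes below)
from sklearn.datasets import make_classification
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

x,y = make_classification(n_samples=500,n_features=30,flip_y=0.03,random_state=7)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=9)

# One fully grown tree: low bias on the training data, high variance on unseen data
single_tree = DecisionTreeClassifier(random_state=9)
single_tree.fit(x_train,y_train)
print("single tree test accuracy = %0.2f" % single_tree.score(x_test,y_test))

# Averaging 100 deep trees built on bootstrapped samples smooths the variance out
bagged_trees = BaggingClassifier(DecisionTreeClassifier(),n_estimators=100,random_state=9)
bagged_trees.fit(x_train,y_train)
print("bagged trees test accuracy = %0.2f" % bagged_trees.score(x_test,y_test))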
Random forest:
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 11 09:57:49 2018
@author: Alvin AI
"""
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import RandomizedSearchCV
from operator import itemgetter
import numpy as np
# Load the data
def get_data():
    no_features = 30
    redundant_features = int(0.1*no_features)
    informative_features = int(0.6*no_features)
    repeated_features = int(0.1*no_features)
    x,y = make_classification(n_samples=500,n_features=no_features,\
            flip_y=0.03,n_informative=informative_features,\
            n_redundant=redundant_features,\
            n_repeated=repeated_features,random_state=7)
    return x,y
# Build a plain random forest
def build_model(x,y,x_dev,y_dev):
    no_trees = 100
    estimator = RandomForestClassifier(n_estimators=no_trees)
    estimator.fit(x,y)
    train_predicted = estimator.predict(x)
    train_score = accuracy_score(y,train_predicted)
    dev_predicted = estimator.predict(x_dev)
    dev_score = accuracy_score(y_dev,dev_predicted)
    print "training accuracy = %0.2f dev accuracy = %0.2f" \
            % (train_score,dev_score)
# Search for the optimal parameters of the forest
def search_parameters(x,y,x_dev,y_dev):
    estimator = RandomForestClassifier()
    no_features = x.shape[1]
    no_iterations = 20  # number of parameter settings sampled by the random search
    sqr_no_features = int(np.sqrt(no_features))
    # criterion: chosen between gini and entropy, used to split nodes in each iteration
    # max_features: the number m of attributes randomly selected at each node split
    # there is also a splitter parameter (default "best"), the internal split mechanism
    # applied over the attributes defined by max_features:
    # best: pick the best possible split among those attributes; random: pick a split attribute at random
    parameters = {"n_estimators":np.random.randint(75,200,no_iterations),\
            "criterion":["gini","entropy"],\
            "max_features":[sqr_no_features,sqr_no_features*2,\
                sqr_no_features*3,sqr_no_features+10]}
    # n_jobs: number of estimators evaluated in parallel; -1 uses all CPUs, 1 means no parallelism
    # 5-fold cross validation with 20 iterations, so 100 models are built in total
    grid = RandomizedSearchCV(estimator=estimator,\
            param_distributions=parameters,\
            verbose=1,\
            n_iter=no_iterations,\
            random_state=77,\
            n_jobs=-1,\
            cv=5)
    #print grid
    grid.fit(x,y)
    print_model_worth(grid,x_dev,y_dev)
    return grid.best_estimator_,grid.grid_scores_
# Print the model's evaluation metrics
def print_model_worth(grid,x_dev,y_dev):
    # grid_scores_: cross validation scores for every parameter combination tried
    # key: which field to sort on; itemgetter(1) fetches element 1 (the mean validation score); reverse=True sorts descending
    scores = sorted(grid.grid_scores_,key=itemgetter(1),reverse=True)[0:5]  # look at only the 5 best models
    for model_no,score in enumerate(scores):
        print "model %d, score = %0.3f" % (model_no+1,score.mean_validation_score)
        print "parameters = {0}".format(score.parameters)
        print
    dev_predicted = grid.predict(x_dev)
    print classification_report(y_dev,dev_predicted)
# Print the most important features
def get_feature_importance(model):
    feature_importance = model.feature_importances_
    fm_with_id = [(i,importance) for i,importance \
            in enumerate(feature_importance)]
    fm_with_id = sorted(fm_with_id,key=itemgetter(1),reverse=True)[0:10]
    print "Top 10 Features"
    for importance in fm_with_id:
        print "feature %d importance = %0.3f" % (importance[0],importance[1])
# Main
if __name__=="__main__":
    x,y = get_data()
    # Split the dataset
    x_train,x_test_all,y_train,y_test_all = train_test_split(x,y,\
            test_size=0.3,random_state=9)
    x_dev,x_test,y_dev,y_test = train_test_split(x_test_all,y_test_all,\
            test_size=0.3,random_state=9)
    # Build the models
    build_model(x_train,y_train,x_dev,y_dev)
    model,score_all = search_parameters(x,y,x_dev,y_dev)
    get_feature_importance(model)
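As a follow-up (a sketch, not part of the original listing): the tuned estimator returned by search_parameters can also be scored on the untouched test split, assuming model, x_test, and y_test from the main block above are still in scope.
# Sketch: evaluate the tuned forest on the held-out test split
test_predicted = model.predict(x_test)
print("test accuracy = %0.2f" % accuracy_score(y_test,test_predicted))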
Extremely randomized trees (extra trees):
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 11 14:46:59 2018
@author: Alvin AI
"""
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report,accuracy_score
from sklearn.cross_validation import train_test_split,cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.grid_search import RandomizedSearchCV
from operator import itemgetter
import numpy as np
def get_data():
    no_features = 30
    redundant_features = int(0.1*no_features)
    informative_features = int(0.6*no_features)
    repeated_features = int(0.1*no_features)
    x,y = make_classification(n_samples=500,n_features=no_features,\
            flip_y=0.3,n_informative=informative_features,\
            n_repeated=repeated_features,\
            n_redundant=redundant_features,\
            random_state=7)
    return x,y
def build_forest(x,y,x_dev,y_dev):
    no_trees = 100
    estimator = ExtraTreesClassifier(n_estimators=no_trees,random_state=51)
    estimator.fit(x,y)
    train_predicted = estimator.predict(x)
    train_score = accuracy_score(y,train_predicted)
    dev_predicted = estimator.predict(x_dev)
    dev_score = accuracy_score(y_dev,dev_predicted)
    print "training accuracy = %0.2f\n dev accuracy = %0.2f" \
            % (train_score,dev_score)
    print "cross validation"
    print cross_val_score(estimator,x_dev,y_dev,cv=5)
def print_model_worth(grid,x_dev,y_dev):
    scores = sorted(grid.grid_scores_,key=itemgetter(1),reverse=True)[0:5]  # look at only the 5 best models
    for model_no,score in enumerate(scores):
        print "model %d, score = %0.3f" % (model_no+1,score.mean_validation_score)
        print "parameters = {0}".format(score.parameters)
        print
    dev_predicted = grid.predict(x_dev)
    print classification_report(y_dev,dev_predicted)
def search_parameters(x,y,x_dev,y_dev):
    estimator = ExtraTreesClassifier()
    no_features = x.shape[1]
    no_iterations = 20
    sqr_no_features = int(np.sqrt(no_features))
    parameters = {"n_estimators":np.random.randint(75,200,no_iterations),\
            "criterion":["gini","entropy"],\
            "max_features":[sqr_no_features,sqr_no_features*2,\
                sqr_no_features*3,sqr_no_features+10]}
    grid = RandomizedSearchCV(estimator=estimator,\
            param_distributions=parameters,\
            verbose=1,\
            n_iter=no_iterations,\
            random_state=77,\
            n_jobs=-1,\
            cv=5)
    grid.fit(x,y)
    print_model_worth(grid,x_dev,y_dev)
    return grid.best_estimator_
if __name__=="__main__":
    x,y = get_data()
    x_train,x_test_all,y_train,y_test_all = train_test_split(x,y,test_size=0.3,\
            random_state=9)
    x_dev,x_test,y_dev,y_test = train_test_split(x_test_all,y_test_all,\
            test_size=0.3,random_state=9)
    build_forest(x_train,y_train,x_dev,y_dev)
    model = search_parameters(x,y,x_dev,y_dev)
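The claim that extra trees save split-selection time can be sanity-checked with a rough wall-clock comparison. A minimal sketch, reusing get_data() from the listing above; the numbers are illustrative, not a benchmark.
# Sketch: compare fit times of a random forest and extra trees on the same data
import time
from sklearn.ensemble import RandomForestClassifier

x,y = get_data()

start = time.time()
RandomForestClassifier(n_estimators=100).fit(x,y)
print("random forest fit time = %0.3f s" % (time.time()-start))

start = time.time()
ExtraTreesClassifier(n_estimators=100).fit(x,y)
print("extra trees fit time = %0.3f s" % (time.time()-start))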
Rotation forest:
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 11 17:01:22 2018
@author: Alvin AI
"""
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
import numpy as np
# Load the data
def get_data():
    no_features = 50
    redundant_features = int(0.1*no_features)
    informative_features = int(0.6*no_features)
    repeated_features = int(0.1*no_features)
    x,y = make_classification(n_samples=500,n_features=no_features,\
            flip_y=0.03,n_informative=informative_features,\
            n_redundant=redundant_features,\
            n_repeated=repeated_features,random_state=7)
    return x,y
# Get random subsets of the features
def get_random_subset(iterable,k):
    subsets = []
    iteration = 0
    np.random.shuffle(iterable)  # shuffle the feature indices
    subset = 0
    limit = len(iterable)/k
    while iteration < limit:
        if k <= len(iterable):
            subset = k
        else:
            subset = len(iterable)
        subsets.append(iterable[-subset:])
        del iterable[-subset:]
        iteration += 1
    return subsets
# Build the rotation forest model
def build_rotationtree_model(x_train,y_train,d,k):
    models = []  # the decision trees
    r_matrices = []  # the rotation matrix associated with each tree
    feature_subsets = []  # feature subsets used in each iteration
    for i in range(d):
        x,_,_,_ = train_test_split(x_train,y_train,test_size=0.3,random_state=7)
        # indices of the features
        feature_index = range(x.shape[1])
        # get subsets of the features:
        # 10 subsets, each holding 5 indices
        random_k_subset = get_random_subset(feature_index,k)
        feature_subsets.append(random_k_subset)  # 25 trees, 10 subsets per tree
        R_matrix = np.zeros((x.shape[1],x.shape[1]),dtype=float)  # the rotation matrix
        for each_subset in random_k_subset:
            pca = PCA()
            x_subset = x[:,each_subset]  # pull out the x values for the indices in this subset
            pca.fit(x_subset)  # principal component analysis
            for ii in range(0,len(pca.components_)):
                for jj in range(0,len(pca.components_)):
                    R_matrix[each_subset[ii],each_subset[jj]] = \
                            pca.components_[ii,jj]
        x_transformed = x_train.dot(R_matrix)
        model = DecisionTreeClassifier()
        model.fit(x_transformed,y_train)
        models.append(model)
        r_matrices.append(R_matrix)
    return models,r_matrices,feature_subsets
def model_worth(models,r_matrices,x,y):
    predicted_ys = []
    for i,model in enumerate(models):
        x_mod = x.dot(r_matrices[i])
        predicted_y = model.predict(x_mod)
        predicted_ys.append(predicted_y)
    predicted_matrix = np.asmatrix(predicted_ys)  # convert to a matrix, 25 x 350
    final_prediction = []
    for i in range(len(y)):
        pred_from_all_models = np.ravel(predicted_matrix[:,i])  # flatten the multi-dimensional array to one dimension
        non_zero_pred = np.nonzero(pred_from_all_models)[0]  # np.nonzero(a) returns the indices of the non-zero elements of a
        is_one = len(non_zero_pred) > len(models)/2  # predict 1 if more than half the trees in the model predicted non-zero
        final_prediction.append(is_one)
    print classification_report(y,final_prediction)
    return predicted_matrix
# Main
if __name__=="__main__":
    x,y = get_data()
    # Split the dataset
    x_train,x_test_all,y_train,y_test_all = train_test_split(x,y,\
            test_size=0.3,random_state=9)
    x_dev,x_test,y_dev,y_test = train_test_split(x_test_all,y_test_all,\
            test_size=0.3,random_state=9)
    models,r_matrices,features = build_rotationtree_model(x_train,y_train,25,5)  # 25 trees, feature subsets of size 5
    predicted_matrix1 = model_worth(models,r_matrices,x_train,y_train)
    predicted_matrix2 = model_worth(models,r_matrices,x_dev,y_dev)
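Since model_worth votes the ensemble and prints a classification report, the held-out test split can be scored the same way once the trees are built. A one-line sketch, assuming the variables from the main block above:
# Sketch: score the rotation forest on the untouched test split
predicted_matrix3 = model_worth(models,r_matrices,x_test,y_test)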