Machine Learning: XGBoost Code Implementation
XGBoost is implemented natively in C++, so it runs fast. This post walks through examples of calling XGBoost from Python; if you want to understand the theory behind XGBoost, refer to the companion theory article.
Example 1: Basic XGBoost usage
Dataset: the iris dataset
# /usr/bin/python
# -*- encoding:utf-8 -*-
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import xgboost as xgb

def show_accuracy(a, b, tip):
    acc = a.ravel() == b.ravel()
    print(acc)
    print(tip + 'accuracy:\t', float(acc.sum()) / a.size)

if __name__ == "__main__":
    # Load the iris dataset
    data = load_iris()
    X = data.data
    Y = data.target
    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=1)
    # Wrap the arrays in XGBoost's DMatrix format
    data_train = xgb.DMatrix(X_train, label=y_train)
    data_test = xgb.DMatrix(X_test, label=y_test)
    print(data_train)
    print(type(data_train))
    # Set the training parameters
    param = {'max_depth': 3, 'eta': 1, 'silent': 1, 'objective': 'multi:softmax', 'num_class': 3}  # logitraw
    watchlist = [(data_test, 'eval'), (data_train, 'train')]
    n_round = 4
    bst = xgb.train(param, data_train, num_boost_round=n_round, evals=watchlist)
    y_hat = bst.predict(data_test)
    show_accuracy(y_hat, y_test, 'XGBoost ')
The per-round evaluation results for 'eval' and 'train' are printed (screenshot omitted in the original). The key call is:

bst = xgb.train(param, data_train, num_boost_round=4, evals=watchlist)

The full signature of xgb.train is:

xgb.train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, maximize=False, early_stopping_rounds=None, evals_result=None, verbose_eval=True, learning_rates=None, xgb_model=None)

Its arguments are documented below; a short sketch after the list shows several of them in use.
1. params: a dict holding the training parameter keys and their values, e.g.
   param = {'max_depth': 3, 'eta': 1, 'silent': 1, 'objective': 'multi:softmax', 'num_class': 3}
   - 'max_depth': maximum depth of a tree; default 6.
   - 'eta': step size (shrinkage) used to prevent overfitting; range 0~1, default 0.3.
   - 'silent': 1 suppresses running messages; 0 prints them.
   - 'objective': the learning task, e.g. linear regression, logistic regression, Poisson regression; default 'reg:linear'.
     - 'reg:linear': linear regression (the default).
     - 'reg:logistic': logistic regression.
     - 'binary:logistic': logistic regression for binary classification; outputs probabilities.
     - 'binary:logitraw': logistic regression for binary classification; outputs the raw score w^T x before the sigmoid.
     - 'multi:softmax': multiclass classification using softmax; requires 'num_class', the number of classes (only used for multiclass objectives).
     - 'multi:softprob': same as softmax, but predicts a vector of ndata * nclass elements containing the predicted probability of each data point belonging to each class.
     - 'rank:pairwise': set XGBoost to do a ranking task by minimizing the pairwise loss.
2. dtrain: the training data (a DMatrix).
3. num_boost_round: the number of boosting iterations.
4. evals: a list of (DMatrix, name) pairs evaluated during training, e.g. evals = [(dtrain, 'train'), (dval, 'val')] or evals = [(dtrain, 'train')].
5. obj: a custom objective function.
6. feval: a custom evaluation function.
7. maximize: whether the evaluation function should be maximized.
8. early_stopping_rounds: early-stopping patience; if set to 100, training stops when the validation error has not improved within the last 100 rounds.
9. evals_result: a dict that stores the evaluation results of the elements in the watchlist.
10. verbose_eval (bool or int): if True, evaluation results for evals are printed every round; if an integer, say 5, they are printed every 5 rounds.
11. learning_rates: a list of learning rates, one per boosting round.
12. xgb_model: an xgb model to load before training (allows training continuation).
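To make these concrete, here is a minimal sketch (my addition, not from the original post) that reuses the data_train and data_test DMatrix objects from Example 1 and exercises 'multi:softprob', evals, early_stopping_rounds, evals_result, and verbose_eval; the variable names are my own:

# Sketch: trying several xgb.train arguments on the iris DMatrix objects
# (data_train, data_test) built in Example 1
param = {'max_depth': 3, 'eta': 0.3, 'objective': 'multi:softprob', 'num_class': 3}
evals_result = {}                          # xgb.train fills this in during training
bst = xgb.train(param, data_train,
                num_boost_round=50,
                evals=[(data_train, 'train'), (data_test, 'eval')],
                early_stopping_rounds=5,   # stop if the 'eval' metric stalls for 5 rounds
                evals_result=evals_result, # per-round metrics are stored here
                verbose_eval=10)           # print metrics every 10 rounds
prob = bst.predict(data_test)   # multi:softprob -> array of shape (n_samples, num_class)
y_hat = prob.argmax(axis=1)     # pick the most probable class per sample
print(evals_result['eval'])     # per-round metric lists, e.g. {'merror': [...]}

Note that early stopping monitors the last entry in evals, so the validation set is listed last here.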
If evals is removed, the output changes:

watchlist = [(data_test, 'eval'), (data_train, 'train')]
# bst = xgb.train(param, data_train, num_boost_round=4, evals=watchlist)
bst = xgb.train(param, data_train, num_boost_round=4)  # without the watchlist

The result: without evals, no per-round evaluation messages are printed (screenshot omitted in the original).
Example 2: Custom gradient and second derivative for the XGBoost loss
Here we supply the gradient and second derivative (Hessian) of the loss function ourselves. The iris dataset is not used this time because it has only 150 samples, which is too few; instead we predict whether a mushroom is poisonous. The code is as follows:
# /usr/bin/python
# -*- encoding:utf-8 -*-
import xgboost as xgb
import numpy as np

# 1. Basic use of XGBoost
# 2. Custom gradient and second derivative for the loss function
# 3. binary:logistic / logitraw

# Custom objective for f = theta * x (logistic loss)
def log_reg(y_hat, y):
    p = 1.0 / (1.0 + np.exp(-y_hat))   # sigmoid of the raw margin
    g = p - y.get_label()              # first-order gradient
    h = p * (1.0 - p)                  # second-order derivative (Hessian)
    return g, h

def error_rate(y_hat, y):
    return 'error', float(sum(y.get_label() != (y_hat > 0.5))) / len(y_hat)

if __name__ == "__main__":
    # Read the data; the file names are truncated in the original post
    # (they refer to the mushroom dataset in libsvm format)
    data_train = xgb.DMatrix('14.')
    data_test = xgb.DMatrix('14.')
    print(data_train)
    print(type(data_train))
    # Set the parameters
    param = {'max_depth': 3, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}  # logitraw
    watchlist = [(data_test, 'eval'), (data_train, 'train')]
    n_round = 7
    bst = xgb.train(param, data_train, num_boost_round=n_round, evals=watchlist,
                    obj=log_reg, feval=error_rate)  # feval reports the error rate
    # Compute the error rate on the test set
    y_hat = bst.predict(data_test)
    y = data_test.get_label()
    print(y_hat)
    print(y)
    error = sum(y != (y_hat > 0.5))
    error_rate = float(error) / len(y_hat)
    print('Total samples:\t', len(y_hat))
    print('Errors:\t%4d' % error)
    print('Error rate:\t%.5f%%' % (100 * error_rate))
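The comments above mention 'binary:logitraw'. As a brief sketch (my addition, reusing data_train and data_test from this example): with logitraw the booster outputs the raw margin w^T x instead of a probability, so the sigmoid must be applied manually before thresholding:

# Sketch: binary:logitraw returns raw margins, not probabilities
param_raw = {'max_depth': 3, 'eta': 1, 'silent': 1, 'objective': 'binary:logitraw'}
bst_raw = xgb.train(param_raw, data_train, num_boost_round=7)
margin = bst_raw.predict(data_test)      # unbounded raw scores w^T x
prob = 1.0 / (1.0 + np.exp(-margin))     # apply the sigmoid by hand
y_pred = (prob > 0.5).astype(int)        # equivalent to thresholding margin at 0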
Example 3: Comparing logistic regression, XGBoost, random forest, bagging, and AdaBoost. Note: the results here should not be taken as a verdict on which algorithm is better.
# /usr/bin/python
# -*- coding:utf-8 -*-
import xgboost as xgb
import numpy as np
import scipy.sparse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# Read sparse libsvm-style data ("label index:value ...") into dense arrays
def read_data(path):
    y = []
    row = []
    col = []
    values = []
    r = 0      # current row (sample) index
    for d in open(path):
        d = d.strip().split()      # split on whitespace
        y.append(int(d[0]))        # the first field is the label
        d = d[1:]
        for c in d:
            key, value = c.split(':')
            row.append(r)
            col.append(int(key))
            values.append(float(value))
        r += 1
    x = scipy.sparse.csr_matrix((values, (row, col))).toarray()
    y = np.array(y)
    return x, y

def show_accuracy(a, b, tip):
    acc = a.ravel() == b.ravel()
    # print(acc)
    print(tip + 'accuracy:\t', float(acc.sum()) / a.size)

if __name__ == '__main__':
    # File names are truncated in the original post (mushroom dataset)
    x_train, y_train = read_data('14.')
    x_test, y_test = read_data('14.')
    # x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, train_size=0.6)

    # Logistic regression
    lr = LogisticRegression(penalty='l1')  # newer scikit-learn also needs solver='liblinear' for L1
    lr.fit(x_train, y_train.ravel())
    y_hat = lr.predict(x_test)
    print('>>####Logistic regression>>###')
    show_accuracy(y_hat, y_test, 'Logistic regression ')

    # XGBoost
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    param = {'max_depth': 2, 'eta': 1, 'silent': 0, 'objective': 'multi:softmax', 'num_class': 3}
    bst = xgb.train(param, data_train, num_boost_round=4, evals=watch_list)  # evals prints metrics each round
    y_hat = bst.predict(data_test)
    print('>>####XGBoost>>###')
    show_accuracy(y_hat, y_test, 'XGBoost ')

    # Random forest
    rnd_clf = RandomForestClassifier(n_estimators=10, max_leaf_nodes=7, n_jobs=1)
    rnd_clf.fit(x_train, y_train)
    y_hat = rnd_clf.predict(x_test)
    print('>>##Random forest>>>')
    show_accuracy(y_hat, y_test, 'Random forest ')

    # Bagging
    tree = DecisionTreeClassifier(criterion='entropy', max_depth=None)
    clf = BaggingClassifier(base_estimator=tree, n_estimators=6, max_samples=1.0, max_features=1.0,
                            bootstrap=True, bootstrap_features=False, n_jobs=1, random_state=1)
    clf.fit(x_train, y_train)
    y_hat = clf.predict(x_test)
    print('>>####bagging>>###')
    show_accuracy(y_hat, y_test, 'bagging ')

    # AdaBoost
    ada_real = AdaBoostClassifier(base_estimator=tree, learning_rate=0.5,
                                  n_estimators=6, algorithm='SAMME.R')  # only the algorithm differs from a discrete-SAMME variant
    ada_real.fit(x_train, y_train)
    y_hat = ada_real.predict(x_test)  # the original predicted with clf (bagging) here; fixed to ada_real
    print('>>####adaboost>>###')
    show_accuracy(y_hat, y_test, 'adaboost')
The results are as follows (screenshots omitted in the original).
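As a side note (my addition, not in the original post): XGBoost also ships a scikit-learn wrapper, XGBClassifier, which follows the same fit/predict pattern as the other models in this comparison; the hyperparameters shown are the wrapper's equivalents of the param dict above:

# Sketch: XGBoost through its scikit-learn wrapper, so it can be dropped
# into the same fit/predict comparison loop as the other classifiers
from xgboost import XGBClassifier
xgb_clf = XGBClassifier(max_depth=2, learning_rate=1.0, n_estimators=4)
xgb_clf.fit(x_train, y_train)
y_hat = xgb_clf.predict(x_test)
show_accuracy(y_hat, y_test, 'XGBClassifier ')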
