Code
$ cat learn_test.py
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

#df=pd.read_table("test.txt",sep=",",header=None)
df=pd.read_csv("test.txt",sep=",")
y=df.y
x=df.drop("y",axis=1)
#x=df[["x1","x2","x3"]]
#x=df[[1,2,3]]
print "Target:"
print y
print "Explain"
print x
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
cf1=LogisticRegression()
cf1.fit(x_train,y_train)
y_test_pred=cf1.predict(x_test)
y_test_prob=cf1.predict_proba(x_test)[:,1]
print("Accuracy: %.2f" % accuracy_score(y_test,y_test_pred))
#print("Accuracy: %.2f" % (y_test!=y_pred).sum())
print pd.DataFrame({"Answer": y_test,"Prediction": y_test_pred, "Prob": y_test_prob})
print "Coefficients:"
print pd.DataFrame(cf1.coef_,columns=x.keys())
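Note: sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20, so on a recent install the import below is needed instead; everything else in the script stays the same.

# scikit-learn >= 0.18 moved train_test_split to model_selection
from sklearn.model_selection import train_test_split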
Input file
$ cat test.txt
y,x1,x2,x3
0,2,1,-8
0,5,6,7
0,2,10,-12
0,3,1,3
0,3,1,4
1,3,10,-11
1,4,13,-12
1,3,12,-11
1,2,11,-15
1,1,11,-16
1,3,16,-11
Output:
$ python learn_test.py
Target:
0 0
1 0
2 0
3 0
4 0
5 1
6 1
7 1
8 1
9 1
10 1
Name: y, dtype: int64
Explain
x1 x2 x3
0 2 1 -8
1 5 6 7
2 2 10 -12
3 3 1 3
4 3 1 4
5 3 10 -11
6 4 13 -12
7 3 12 -11
8 2 11 -15
9 1 11 -16
10 3 16 -11
Accuracy: 0.75
Answer Prediction Prob
4 0 0 0.062660
9 1 1 0.995225
2 0 1 0.975860
10 1 1 0.996505
Coefficients:
x1 x2 x3
0 -0.852768 0.48049 -0.076725
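The coefficients are on the log-odds scale, so exponentiating them gives odds ratios: the multiplicative change in the odds of y=1 per one-unit increase in each feature. A minimal sketch reusing the cf1 and x defined above:

import numpy as np

# exp(coefficient) = odds ratio per unit increase of the feature
print pd.DataFrame(np.exp(cf1.coef_), columns=x.keys())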
Version with various evaluation metrics added
$ cat learn_test.py
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score

#df=pd.read_table("test.txt",sep=",",header=None)
df=pd.read_csv("test.txt",sep=",")
y=df.y
x=df.drop("y",axis=1)
#x=df[["x1","x2","x3"]]
#x=df[[1,2,3]]
print "Target:"
print y
print "Explain"
print x
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
cf1=LogisticRegression()
cf1.fit(x_train,y_train)
y_test_pred=cf1.predict(x_test)
y_test_prob=cf1.predict_proba(x_test)[:,1]
print("Accuracy: %.2f" % accuracy_score(y_test,y_test_pred))
print "Score:"
print cf1.score(x_test,y_test)
print "Confusion_matrix:"
print confusion_matrix(y_test, y_test_pred)
print "Classification_report:"
print classification_report(y_test, y_test_pred)
#AUC
# NOTE: computed here from the hard 0/1 predictions;
# learn2.py below uses the probabilities (y_test_prob) instead
print "AUC:"
fpr, tpr, _ = roc_curve(y_test, y_test_pred)
auc=roc_auc_score(y_test, y_test_pred)
print auc
print "AR:"
print 2*auc-1
#print("Accuracy: %.2f" % (y_test!=y_pred).sum())
print pd.DataFrame({"Answer": y_test,"Prediction": y_test_pred, "Prob": y_test_prob})
print "Coefficients:"
print pd.DataFrame(cf1.coef_,columns=x.keys())
$ python learn_test.py
Target:
0 0
1 0
2 0
3 0
4 0
5 1
6 1
7 1
8 1
9 1
10 1
Name: y, dtype: int64
Explain
x1 x2 x3
0 2 1 -8
1 5 6 7
2 2 10 -12
3 3 1 3
4 3 1 4
5 3 10 -11
6 4 13 -12
7 3 12 -11
8 2 11 -15
9 1 11 -16
10 3 16 -11
Accuracy: 0.75
Score:
0.75
Confusion_matrix:
[[1 1]
[0 2]]
Classification_report:
             precision    recall  f1-score   support

          0       1.00      0.50      0.67         2
          1       0.67      1.00      0.80         2

avg / total       0.83      0.75      0.73         4
AUC:
0.75
AR:
0.5
Answer Prediction Prob
4 0 0 0.062660
9 1 1 0.995225
2 0 1 0.975860
10 1 1 0.996505
Coefficients:
x1 x2 x3
0 -0.852768 0.48049 -0.076725
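Note that this version passes the hard 0/1 predictions to roc_curve and roc_auc_score, which reduces the ROC to a single operating point; the 0.75 above is effectively a balanced-accuracy number. Feeding the predicted probabilities, as learn2.py does next, gives the proper ranking-based AUC:

# AUC from probabilities rather than hard class labels
fpr, tpr, _ = roc_curve(y_test, y_test_prob)
auc = roc_auc_score(y_test, y_test_prob)
print "AUC:", auc
print "AR:", 2*auc-1  # AR (accuracy ratio / Gini) = 2*AUC - 1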
- For categorical features, the categorical data must first be converted (encoded as numbers)
Code
$ cat learn2.py
#-*- coding: utf-8 -*-
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.feature_extraction import DictVectorizer as DV

df=pd.read_csv("test2.txt",sep=",")
print "origin"
print df
y=df[[0]]
x=df[range(1,df.shape[1])]

# one-hot encode the categorical columns via DictVectorizer
cat_key=["x4","x5"]
cat_x=x[cat_key]
cat_dict=cat_x.T.to_dict().values()
vectorizer = DV( sparse = False )
vec_x_cat_train = vectorizer.fit_transform(cat_dict)
vec_x_names=vectorizer.get_feature_names()
# drop the original categorical columns and merge in the encoded ones
x_drop=x.drop(cat_key,axis=1)
x=pd.merge(pd.DataFrame(x_drop,index=x.index),
           pd.DataFrame(vec_x_cat_train,index=x.index,columns=vec_x_names),
           left_index=True,right_index=True)

print "Target:"
print y
print "Explain"
print x
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
cf1=LogisticRegression()
cf1.fit(x_train,y_train)
y_test_pred=cf1.predict(x_test)
y_train_pred=cf1.predict(x_train)
y_test_prob=cf1.predict_proba(x_test)[:,1]
y_train_prob=cf1.predict_proba(x_train)[:,1]
print("Accuracy: %.2f" % accuracy_score(y_test,y_test_pred))
print "Score:"
print cf1.score(x_test,y_test)
print "Confusion_matrix(test train):"
print "test"
print confusion_matrix(y_test, y_test_pred)
print "train"
print confusion_matrix(y_train, y_train_pred)
print "Classification_report (test train):"
print "test:"
print classification_report(y_test, y_test_pred)
print "train:"
print classification_report(y_train, y_train_pred)
#AUC, this time from the predicted probabilities
print "AUC (test train):"
fpr, tpr, _ = roc_curve(y_test, y_test_prob)
auc=roc_auc_score(y_test, y_test_prob)
auc_train=roc_auc_score(y_train, y_train_prob)
print auc,auc_train
print "AR (test train):"
print 2*auc-1,2*auc_train-1
print pd.DataFrame({"Answer": y_test,"Prediction": y_test_pred, "Prob": y_test_prob})
print "Coefficients:"
print pd.DataFrame(cf1.coef_,columns=x.keys())
Input
$ cat test2.txt
y,x1,x2,x3,x4,x5
0,2,1,-8,M,T
0,5,6,7,M,O
0,2,10,-12,F,N
0,3,1,3,M,N
0,3,1,4,F,O
1,3,10,-11,M,N
1,4,13,-12,M,N
1,3,12,-11,M,O
1,2,11,-15,F,T
1,1,11,-16,F,T
1,3,16,-11,F,O
Output
$ python learn2.py
origin
y x1 x2 x3 x4 x5
0 0 2 1 -8 M T
1 0 5 6 7 M O
2 0 2 10 -12 F N
3 0 3 1 3 M N
4 0 3 1 4 F O
5 1 3 10 -11 M N
6 1 4 13 -12 M N
7 1 3 12 -11 M O
8 1 2 11 -15 F T
9 1 1 11 -16 F T
10 1 3 16 -11 F O
Target:
y
0 0
1 0
2 0
3 0
4 0
5 1
6 1
7 1
8 1
9 1
10 1
Explain
x1 x2 x3 x4=F x4=M x5=N x5=O x5=T
0 2 1 -8 0 1 0 0 1
1 5 6 7 0 1 0 1 0
2 2 10 -12 1 0 1 0 0
3 3 1 3 0 1 1 0 0
4 3 1 4 1 0 0 1 0
5 3 10 -11 0 1 1 0 0
6 4 13 -12 0 1 1 0 0
7 3 12 -11 0 1 0 1 0
8 2 11 -15 1 0 0 0 1
9 1 11 -16 1 0 0 0 1
10 3 16 -11 1 0 0 1 0
/usr/lib/python2.7/site-packages/sklearn/utils/validation.py:515: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
Accuracy: 0.75
Score:
0.75
Confusion_matrix(test train):
test
[[1 1]
[0 2]]
train
[[3 0]
[0 4]]
Classification_report (test train):
test:
             precision    recall  f1-score   support

          0       1.00      0.50      0.67         2
          1       0.67      1.00      0.80         2

avg / total       0.83      0.75      0.73         4
train:
             precision    recall  f1-score   support

          0       1.00      1.00      1.00         3
          1       1.00      1.00      1.00         4

avg / total       1.00      1.00      1.00         7
AUC (test train):
1.0 1.0
AR (test train):
1.0 1.0
Answer Prediction Prob
0 (y,) 0 0.072001
1 (y,) 1 0.995499
2 (y,) 1 0.982635
3 (y,) 1 0.997178
Coefficients:
         x1       x2        x3      x4=F      x4=M     x5=N      x5=O      x5=T
0 -0.760449  0.46168 -0.099912  0.010561 -0.296954  0.01677 -0.061206 -0.241957
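Both the DataConversionWarning above and the odd (y,) entries in the Answer column have the same cause: y=df[[0]] selects a one-column DataFrame instead of a Series. A minimal fix:

# select the target as a 1-d Series rather than a one-column DataFrame
y = df["y"]
# or, to flatten an existing one-column DataFrame:
y = df[[0]].values.ravel()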
Practical example:
$ cat learn_aprt2.py
#-*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.feature_extraction import DictVectorizer as DV

# "新築" ("newly built") is mapped to 0; "−" is treated as missing
df=pd.read_csv("data_first_false_select.dat",sep=",",header=None,
               dtype={5:'f2'},na_values=["−"]).replace("新築",0)
print "origin"
print df
y=pd.DataFrame(df[[1]])
x=df[range(2,df.shape[1])]
print "x"
print x

# one-hot encode the categorical columns; missing values become their own category
cat_key=[3,4,7,9,12,13,14,15]
cat_x=x[cat_key].fillna("No_data")
cat_dict=cat_x.T.to_dict().values()
vectorizer = DV( sparse = False )
vec_x_cat_train = vectorizer.fit_transform(cat_dict)
vec_x_names=vectorizer.get_feature_names()

# numeric columns: replace inf with NaN, then impute with the column median
x_drop=x.drop(cat_key,axis=1)
print x_drop.isin([np.inf]).any()
x_drop=x_drop.replace(np.inf,np.nan)
x_drop=x_drop.fillna(x_drop.median())
print x_drop.describe()
print x_drop.notnull().all()
print x_drop[8]

x=pd.merge(pd.DataFrame(x_drop,index=x.index),
           pd.DataFrame(vec_x_cat_train,index=x.index,columns=vec_x_names),
           left_index=True,right_index=True)
#x=pd.DataFrame(np.log(x_drop[[2]]))  #Fee only
# Ratio of area and fee: this single log-ratio feature replaces the merged frame above
x=pd.DataFrame(np.log(x_drop[2]/x_drop[6]))

print "Target:"
print y
print "Explain"
print x
y=y.values.reshape((y.shape[0],))
print x
print type(x[0])

# bin the feature into 30 quantile bins and check the target rate per bin
cats=pd.qcut(x[0],30)
print cats
print pd.Series(y).groupby(cats).mean()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)
print y_train.shape
print x_train.shape
print y_train
cf1=LogisticRegression(C=1.0,penalty="l2")
cf1.fit(x_train,y_train)
y_test_pred=cf1.predict(x_test)
y_train_pred=cf1.predict(x_train)
y_test_prob=cf1.predict_proba(x_test)[:,1]
y_train_prob=cf1.predict_proba(x_train)[:,1]
print("Accuracy: %.2f" % accuracy_score(y_test,y_test_pred))
print "Score:"
print cf1.score(x_test,y_test)
print "Confusion_matrix(test train):"
print "test"
print confusion_matrix(y_test, y_test_pred)
print "train"
print confusion_matrix(y_train, y_train_pred)
print "Classification_report (test train):"
print "test:"
print classification_report(y_test, y_test_pred)
print "train:"
print classification_report(y_train, y_train_pred)
#AUC
print "AUC (test train):"
fpr, tpr, _ = roc_curve(y_test, y_test_prob)
auc=roc_auc_score(y_test, y_test_prob)
auc_train=roc_auc_score(y_train, y_train_prob)
print auc,auc_train
print "AR (test train):"
print 2*auc-1,2*auc_train-1
print pd.DataFrame({"Answer": y_test,"Prediction": y_test_pred, "Prob": y_test_prob})
print "Coefficients:"
print pd.DataFrame(cf1.coef_,columns=x.keys())
Converting a dict to an array
In [29]: from sklearn.feature_extraction import DictVectorizer

In [30]: features=[{"name":"taro","sex":"M"},{"name":"jiro","sex":"F"}]

In [32]: vec = DictVectorizer()

In [33]: vec
Out[33]:
DictVectorizer(dtype=<type 'numpy.float64'>, separator='=', sort=True,
        sparse=True)

In [34]: array_vectors = vec.fit_transform(features).toarray()

In [35]: array_vectors
Out[35]:
array([[ 0.,  1.,  0.,  1.],
       [ 1.,  0.,  1.,  0.]])

In [36]: vec.get_feature_names()
Out[36]: ['name=jiro', 'name=taro', 'sex=F', 'sex=M']
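Continuing the session: after fitting, new records should go through transform (not fit_transform), so they are mapped onto the columns learned above; values never seen during fitting are simply dropped. A sketch:

In [37]: vec.transform([{"name":"taro","sex":"F"}]).toarray()
Out[37]: array([[ 0.,  1.,  1.,  0.]])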
From a DataFrame to a list of dicts
In [42]: df=pd.read_csv("test2.txt",sep=",")
In [43]: df
Out[43]:
y x1 x2 x3 x4 x5
0 0 2 1 -8 M T
1 0 5 6 7 M O
2 0 2 10 -12 F N
3 0 3 1 3 M N
4 0 3 1 4 F O
5 1 3 10 -11 M N
6 1 4 13 -12 M N
7 1 3 12 -11 M O
8 1 2 11 -15 F T
9 1 1 11 -16 F T
10  1   3  16 -11  F  O

In [44]: cat_df=df[["x4","x5"]]
In [45]: cat_df
Out[45]:
x4 x5
0 M T
1 M O
2 F N
3 M N
4 F O
5 M N
6 M N
7 M O
8 F T
9 F T
10  F  O

In [46]: cat_dict=cat_df.T.to_dict().values()
In [47]: cat_dict
Out[47]:
[{'x4': 'M', 'x5': 'T'},
{'x4': 'M', 'x5': 'O'},
{'x4': 'F', 'x5': 'N'},
{'x4': 'M', 'x5': 'N'},
{'x4': 'F', 'x5': 'O'},
{'x4': 'M', 'x5': 'N'},
{'x4': 'M', 'x5': 'N'},
{'x4': 'M', 'x5': 'O'},
{'x4': 'F', 'x5': 'T'},
{'x4': 'F', 'x5': 'T'},
{'x4': 'F', 'x5': 'O'}]
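The transpose-then-to_dict idiom above builds one dict per row; recent pandas can do that directly, and keeps the rows in order explicitly (an equivalent sketch):

# one dict per row, without the transpose trick
cat_dict = cat_df.to_dict(orient="records")
# [{'x4': 'M', 'x5': 'T'}, {'x4': 'M', 'x5': 'O'}, ...]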
From the dicts to a new DataFrame
In [102]: from sklearn.feature_extraction import DictVectorizer as DV

In [102]: vectorizer = DV( sparse = False )

In [103]: vec_x_cat_train = vectorizer.fit_transform(cat_dict)

In [104]: vec_x_cat_train
Out[104]:
array([[ 0.,  1.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  1.,  0.],
       [ 1.,  0.,  1.,  0.,  0.],
       [ 0.,  1.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  1.,  0.],
       [ 0.,  1.,  1.,  0.,  0.],
       [ 0.,  1.,  1.,  0.,  0.],
       [ 0.,  1.,  0.,  1.,  0.],
       [ 1.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  1.,  0.]])

In [105]: x=df.drop(["x4","x5"],axis=1)

In [106]: x
Out[106]:
    y  x1  x2  x3
0   0   2   1  -8
1   0   5   6   7
2   0   2  10 -12
3   0   3   1   3
4   0   3   1   4
5   1   3  10 -11
6   1   4  13 -12
7   1   3  12 -11
8   1   2  11 -15
9   1   1  11 -16
10  1   3  16 -11

In [107]: x2=pd.merge(pd.DataFrame(x,index=x.index),pd.DataFrame(vec_x_cat_train,index=x.index),left_index=True,right_index=True)

In [108]: x2
Out[108]:
    y  x1  x2  x3  0  1  2  3  4
0   0   2   1  -8  0  1  0  0  1
1   0   5   6   7  0  1  0  1  0
2   0   2  10 -12  1  0  1  0  0
3   0   3   1   3  0  1  1  0  0
4   0   3   1   4  1  0  0  1  0
5   1   3  10 -11  0  1  1  0  0
6   1   4  13 -12  0  1  1  0  0
7   1   3  12 -11  0  1  0  1  0
8   1   2  11 -15  1  0  0  0  1
9   1   1  11 -16  1  0  0  0  1
10  1   3  16 -11  1  0  0  1  0

In [116]: vectorizer.get_feature_names()
Out[116]: ['x4=F', 'x4=M', 'x5=N', 'x5=O', 'x5=T']
The categorical-data handling, collected in one place (an alternative using pd.get_dummies follows the block):
y=df[[0]]
x=df[range(1,(df.shape[1]))]
cat_key=["x4","x5"]
cat_x=x[cat_key]
cat_dict=cat_x.T.to_dict().values()
vectorizer = DV( sparse = False )
vec_x_cat_train = vectorizer.fit_transform(cat_dict)
vec_x_names=vectorizer.get_feature_names()
x_drop=x.drop(cat_key,axis=1)
x=pd.merge(pd.DataFrame(x_drop,index=x.index),pd.DataFrame(vec_x_cat_train,index=x.index,columns=vec_x_names),left_index=True,right_index=True)
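For data already in a DataFrame, pd.get_dummies gives the same one-hot columns in a single call, without going through dicts; a minimal alternative sketch (columns come out named x4_F etc. instead of x4=F):

# one-hot encode x4 and x5; numeric columns pass through unchanged
x = pd.get_dummies(df.drop("y", axis=1), columns=["x4", "x5"])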
Imputing with the mean
In [363]: b=pd.DataFrame({"a":[1,3,5],"b":[1,4,None],"c":[10,None,32]})
In [364]: b
Out[364]:
a b c
0 1 1 10
1 3 4 NaN
2  5  NaN  32

In [365]: b.fillna(b.mean(axis=0))
Out[365]:
a b c
0 1 1.0 10
1 3 4.0 21
2 5 2.5 32
Filling inf and NaN with the mean (note: this example uses a different `b`, one that contains np.inf values)
In [415]: b.replace(np.inf,np.nan).fillna(b.replace(np.inf,np.nan).mean())
Out[415]:
a c
0 1 3
1 2 4
2 3 4
3 2 5
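When a column has heavy outliers (as rents and fees usually do), the median is a more robust fill value than the mean; learn_aprt2.py above does exactly this:

# replace inf with NaN, then fill each column with its median
b = b.replace(np.inf, np.nan).fillna(b.median())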
Discretization: cutting into bins (binning) and aggregating per category
Note that x[0] yields a pd.Series (which is what pd.qcut expects); x[[0]] would give a DataFrame.
cats=pd.qcut(x[0],30)
print cats
print pd.Series(y).groupby(cats).mean()
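A self-contained illustration of the same pattern, on hypothetical data (names and values invented purely for the example): qcut puts each value into an equal-frequency quantile bin, and grouping the 0/1 target by those bins shows the positive rate per bin, a quick check of how monotonic the feature is.

import numpy as np
import pandas as pd

np.random.seed(0)
score = pd.Series(np.random.randn(1000))           # hypothetical feature
target = (score + np.random.randn(1000) > 0) * 1   # hypothetical 0/1 target

bins = pd.qcut(score, 10)          # 10 equal-frequency bins
print target.groupby(bins).mean()  # positive rate per bin; rises with score here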
Handy functions
From http://blog.brainpad.co.jp/entry/2014/12/10/204111 :
These summarize the data column by column.
# first 5 rows of the data
df.head(5)
# last 5 rows of the data
df.tail(5)
# check column dtypes
df.info()
# row counts and summary statistics
df.describe()
# type of the object itself
type(df)
# dtype of one column
type(df['price'])
References:
http://techblog.yahoo.co.jp/datascience/use_scikit-learn/
http://scikit-learn.org/dev/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression
http://qiita.com/gash717/items/cd9b97a9d26f6ec90df3
http://aidiary.hatenablog.com/entry/20150826/1440596779
AUC
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html
Handling categorical data
- DataFrame to dict
Converting categorical data into numbers with Pandas and Scikit-learn - FastML http://fastml.com/converting-categorical-data-into-numbers-with-pandas-and-scikit-learn/
- Converting a dict to an array
http://qiita.com/fukkyy/items/918b97cb0becb65f85ce
How to calculate p-values
http://stackoverflow.com/questions/24255723/sklearn-logistic-regression-important-features
statsmodels
http://statsmodels.sourceforge.net
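scikit-learn's LogisticRegression does not report p-values; statsmodels does. A minimal sketch, assuming the x and y DataFrames from learn2.py above (sm.add_constant adds the intercept column statsmodels expects):

import statsmodels.api as sm

# unpenalized logistic fit; summary() reports coefficients,
# standard errors, z-values and p-values
logit = sm.Logit(y, sm.add_constant(x))
result = logit.fit()
print result.summary()

Note that statsmodels fits by plain maximum likelihood, without sklearn's default L2 penalty, so the coefficients will differ slightly; on a tiny, nearly separable sample like test2.txt the fit may also warn about (quasi-)separation.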