scikit-learnでロジスティック回帰

コード
$ cat learn_test.py

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score


# Minimal logistic-regression walkthrough on test.txt (Python 2 script: note
# the print statements).
# NOTE(review): train_test_split is imported from sklearn.cross_validation at
# the top of this script; that module was removed in scikit-learn 0.20, where
# the function lives in sklearn.model_selection.

# Load the comma-separated file; its first line supplies the column names.
#df=pd.read_table("test.txt",sep=",",header=None)
df=pd.read_csv("test.txt",sep=",")

# Column "y" is the target; every remaining column is an explanatory variable.
y=df.y
x=df.drop("y",axis=1)
#x=df[["x1","x2","x3"]]
#x=df[[1,2,3]]

print "Target:"
print y
print "Explain"
print x
# Hold out 30% of the rows for testing; random_state=0 makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Fit a logistic regression with default settings on the training rows.
cf1=LogisticRegression()
cf1.fit(x_train,y_train)


# Predicted class labels, and column 1 of predict_proba (the probability of
# the second class, i.e. label 1 for this 0/1 target).
y_test_pred=cf1.predict(x_test)
y_test_prob=(cf1.predict_proba(x_test)[:,1])

print("Accuracy: %.2f" % accuracy_score(y_test,y_test_pred))
#print("Accuracy: %.2f" % (y_test!=y_pred).sum())

# Side-by-side table of true label, predicted label and predicted probability.
print pd.DataFrame({"Anser": y_test,"Prediction": y_test_pred, "Prob": y_test_prob})

# Fitted coefficients, labelled with the feature column names.
print "Coefficients:"
print pd.DataFrame(cf1.coef_,columns=x.keys())

入力ファイル
$ cat test.txt

y,x1,x2,x3
0,2,1,-8
0,5,6,7
0,2,10,-12
0,3,1,3
0,3,1,4
1,3,10,-11
1,4,13,-12
1,3,12,-11
1,2,11,-15
1,1,11,-16
1,3,16,-11

出力:
$ python learn_test.py

Target:
0 0
1 0
2 0
3 0
4 0
5 1
6 1
7 1
8 1
9 1
10 1
Name: y, dtype: int64
Explain
x1 x2 x3
0 2 1 -8
1 5 6 7
2 2 10 -12
3 3 1 3
4 3 1 4
5 3 10 -11
6 4 13 -12
7 3 12 -11
8 2 11 -15
9 1 11 -16
10 3 16 -11
Accuracy: 0.75
Anser Prediction Prob
4 0 0 0.062660
9 1 1 0.995225
2 0 1 0.975860
10 1 1 0.996505
Coefficients:
x1 x2 x3
0 -0.852768 0.48049 -0.076725


さまざまな評価指標を加えたもの

$ cat learn_test.py
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score

# Same walkthrough as the basic script, extended with a confusion matrix, a
# classification report and AUC/AR (Python 2 script: note the print statements).
# NOTE(review): train_test_split is imported from sklearn.cross_validation at
# the top of this script; that module was removed in scikit-learn 0.20, where
# the function lives in sklearn.model_selection.

# Load the comma-separated file; its first line supplies the column names.
#df=pd.read_table("test.txt",sep=",",header=None)
df=pd.read_csv("test.txt",sep=",")


# Column "y" is the target; every remaining column is an explanatory variable.
y=df.y
x=df.drop("y",axis=1)
#x=df[["x1","x2","x3"]]
#x=df[[1,2,3]]

print "Target:"
print y
print "Explain"
print x
# Hold out 30% of the rows for testing; random_state=0 makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Fit a logistic regression with default settings on the training rows.
cf1=LogisticRegression()
cf1.fit(x_train,y_train)



# Predicted class labels, and column 1 of predict_proba (the probability of
# the second class, i.e. label 1 for this 0/1 target).
y_test_pred=cf1.predict(x_test)
y_test_prob=(cf1.predict_proba(x_test)[:,1])

print("Accuracy: %.2f" % accuracy_score(y_test,y_test_pred))
# Accuracy again via the estimator's own score() on the test rows.
print "Score:"
print cf1.score(x_test,y_test)
print "Confusion_matrix:"
print confusion_matrix(y_test, y_test_pred)
print "Classification_report:"

print classification_report(y_test, y_test_pred)

#AUC
# NOTE(review): roc_curve / roc_auc_score receive the hard 0/1 predictions
# here, so the ROC curve has a single operating point. Passing y_test_prob
# (as the later learn2.py script does) would give the usual ranking-based AUC.
# fpr and tpr are computed but not used afterwards in this script.
print "AUC:"
fpr, tpr, _ = roc_curve(y_test, y_test_pred)
auc=roc_auc_score(y_test, y_test_pred)
print auc
# AR is derived from AUC as 2*AUC-1 (presumably the accuracy ratio / Gini).
print "AR:"
print 2*auc-1

#print("Accuracy: %.2f" % (y_test!=y_pred).sum())

# Side-by-side table of true label, predicted label and predicted probability.
print pd.DataFrame({"Anser": y_test,"Prediction": y_test_pred, "Prob": y_test_prob})


# Fitted coefficients, labelled with the feature column names.
print "Coefficients:"
print pd.DataFrame(cf1.coef_,columns=x.keys())

$ python learn_test.py

Target:
0 0
1 0
2 0
3 0
4 0
5 1
6 1
7 1
8 1
9 1
10 1
Name: y, dtype: int64
Explain
x1 x2 x3
0 2 1 -8
1 5 6 7
2 2 10 -12
3 3 1 3
4 3 1 4
5 3 10 -11
6 4 13 -12
7 3 12 -11
8 2 11 -15
9 1 11 -16
10 3 16 -11
Accuracy: 0.75
Score:
0.75
Confusion_matrix:
[[1 1]
[0 2]]
Classification_report:
precision recall f1-score support

0 1.00 0.50 0.67 2
1 0.67 1.00 0.80 2

avg / total 0.83 0.75 0.73 4

AUC:
0.75
AR:
0.5
Anser Prediction Prob
4 0 0 0.062660
9 1 1 0.995225
2 0 1 0.975860
10 1 1 0.996505
Coefficients:
x1 x2 x3
0 -0.852768 0.48049 -0.076725

  • カテゴリカルの場合は、カテゴリデータの変換の必要あり

コード
$ cat learn2.py

#-*- coding: utf-8 -*-
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.feature_extraction import DictVectorizer as DV




#df=pd.read_table("test.txt",sep=",",header=None)
#df=pd.read_csv("test.txt",sep=",")

#df=pd.read_csv("test.txt",sep=",",header=None)
#print df
#df=pd.read_csv("data_first_false_select.dat",sep=",",header=None,dtype={5:'f2'},na_values=["−"],index_col=0)

# Load the sample that adds two categorical columns (x4, x5); the first line
# of test2.txt supplies the column names.
df=pd.read_csv("test2.txt",sep=",")
print "origin"
print df
#y=df[[1]]
#x=df[range(2,(df.shape[1]))]

# Select by position: column 0 is the target, columns 1.. are the features.
# NOTE(review): df[[0]] yields a one-column DataFrame rather than a Series.
# The run shown in this document emits a DataConversionWarning and prints
# "(y,)" in the "Anser" column, presumably as a consequence - confirm.
# NOTE(review): integer lists are used on a frame with string column labels;
# that worked for the run shown here, but current pandas may need df.iloc.
y=df[[0]]
x=df[range(1,(df.shape[1]))]
# One-hot encode the categorical columns with DictVectorizer (imported as DV).
cat_key=["x4","x5"]
cat_x=x[cat_key]
# One {column: value} dict per row, the input format DictVectorizer takes.
cat_dict=cat_x.T.to_dict().values()
vectorizer = DV( sparse = False )
vec_x_cat_train = vectorizer.fit_transform(cat_dict)
# Names of the generated indicator columns ("x4=F", "x4=M", ...).
vec_x_names=vectorizer.get_feature_names()
# Keep the numeric columns and join them with the indicator columns on the
# row index to form the final feature table.
x_drop=x.drop(cat_key,axis=1)
x=pd.merge(pd.DataFrame(x_drop,index=x.index),pd.DataFrame(vec_x_cat_train,index=x.index,columns=vec_x_names),left_index=True,right_index=True)


#print cat_x


#exit()
#quit()

#y=df[:2]

#x=df.drop("y",axis=1)
#x=df.drop("y",axis=1)

#y=df[[1]]
#print 2:(df.shape[1]-1)
#x=df[range(2,(df.shape[1]))]

#exit()

#print y
#x=df.drop("y",axis=1)
#x=df[["x1","x2","x3"]]
#x=df[[1,2,3]]

# Show the target and the encoded feature table, then split, fit and predict.
print "Target:"
print y
print "Explain"
print x

#exit()
# Hold out 30% of the rows for testing; random_state=0 makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
#exit()

# Fit a logistic regression with default settings on the training rows.
cf1=LogisticRegression()
cf1.fit(x_train,y_train)



# Predicted labels and column-1 probabilities for both the test and the
# training rows, so the two sets can be evaluated side by side.
y_test_pred=cf1.predict(x_test)
y_train_pred=cf1.predict(x_train)
y_test_prob=(cf1.predict_proba(x_test)[:,1])
y_train_prob=(cf1.predict_proba(x_train)[:,1])


# Report accuracy, confusion matrices, classification reports and AUC/AR for
# the test and the training rows.
print("Accuracy: %.2f" % accuracy_score(y_test,y_test_pred))
print "Score:"
print cf1.score(x_test,y_test)
print "Confusion_matrix(test train):"
print "test"
print confusion_matrix(y_test, y_test_pred)
print "train"
print confusion_matrix(y_train, y_train_pred)
print "Classification_report (test train):"
print "test:"
print classification_report(y_test, y_test_pred)
print "train:"
print classification_report(y_train, y_train_pred)
#AUC
# AUC is computed from the predicted probabilities here (not the hard labels).
# fpr and tpr are computed but not used afterwards in this script.
print "AUC (test train):"
fpr, tpr, _ = roc_curve(y_test, y_test_prob)
auc=roc_auc_score(y_test, y_test_prob)
auc_train=roc_auc_score(y_train, y_train_prob)
print auc,auc_train
# AR is derived from AUC as 2*AUC-1 (presumably the accuracy ratio / Gini).
print "AR (test train):"
print 2*auc-1,2*auc_train-1

#print("Accuracy: %.2f" % (y_test!=y_pred).sum())

# Side-by-side table of true label, predicted label and predicted probability.
print pd.DataFrame({"Anser": y_test,"Prediction": y_test_pred, "Prob": y_test_prob})


# Fitted coefficients, labelled with the feature column names.
print "Coefficients:"
print pd.DataFrame(cf1.coef_,columns=x.keys())


入力
$ cat test2.txt

y,x1,x2,x3,x4,x5
0,2,1,-8,M,T
0,5,6,7,M,O
0,2,10,-12,F,N
0,3,1,3,M,N
0,3,1,4,F,O
1,3,10,-11,M,N
1,4,13,-12,M,N
1,3,12,-11,M,O
1,2,11,-15,F,T
1,1,11,-16,F,T
1,3,16,-11,F,O

出力

$ python learn2.py
origin
y x1 x2 x3 x4 x5
0 0 2 1 -8 M T
1 0 5 6 7 M O
2 0 2 10 -12 F N
3 0 3 1 3 M N
4 0 3 1 4 F O
5 1 3 10 -11 M N
6 1 4 13 -12 M N
7 1 3 12 -11 M O
8 1 2 11 -15 F T
9 1 1 11 -16 F T
10 1 3 16 -11 F O
Target:
y
0 0
1 0
2 0
3 0
4 0
5 1
6 1
7 1
8 1
9 1
10 1
Explain
x1 x2 x3 x4=F x4=M x5=N x5=O x5=T
0 2 1 -8 0 1 0 0 1
1 5 6 7 0 1 0 1 0
2 2 10 -12 1 0 1 0 0
3 3 1 3 0 1 1 0 0
4 3 1 4 1 0 0 1 0
5 3 10 -11 0 1 1 0 0
6 4 13 -12 0 1 1 0 0
7 3 12 -11 0 1 0 1 0
8 2 11 -15 1 0 0 0 1
9 1 11 -16 1 0 0 0 1
10 3 16 -11 1 0 0 1 0
/usr/lib/python2.7/site-packages/sklearn/utils/validation.py:515: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
Accuracy: 0.75
Score:
0.75
Confusion_matrix(test train):
test
[[1 1]
[0 2]]
train
[[3 0]
[0 4]]
Classification_report (test train):
test:
precision recall f1-score support

0 1.00 0.50 0.67 2
1 0.67 1.00 0.80 2

avg / total 0.83 0.75 0.73 4

train:
precision recall f1-score support

0 1.00 1.00 1.00 3
1 1.00 1.00 1.00 4

avg / total 1.00 1.00 1.00 7

AUC (test train):
1.0 1.0
AR (test train):
1.0 1.0
Anser Prediction Prob
0 (y,) 0 0.072001
1 (y,) 1 0.995499
2 (y,) 1 0.982635
3 (y,) 1 0.997178
Coefficients:
x1 x2 x3 x4=F x4=M x5=N x5=O \
0 -0.760449 0.46168 -0.099912 0.010561 -0.296954 0.01677 -0.061206

x5=T
0 -0.241957

実践編:

$ cat learn_aprt2.py
#-*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.feature_extraction import DictVectorizer as DV




#df=pd.read_table("test.txt",sep=",",header=None)
#df=pd.read_csv("test.txt",sep=",")

#df=pd.read_csv("test.txt",sep=",",header=None)
#print df
# Load the real data set. header=None means the file has no header row, so
# columns are labelled 0, 1, 2, ...; column 5 is read as 'f2' (16-bit float),
# the string "−" is treated as a missing value, and every "新築" cell
# (Japanese for "newly built") is replaced by 0.
df=pd.read_csv("data_first_false_select.dat",sep=",",header=None,dtype={5:'f2'},na_values=["−"]).replace("新築",0)

#print df.count
#exit()
#quit()
#df=pd.read_csv("test2.txt",sep=",")
print "origin"
print df
# Column 1 is the target (kept as a one-column DataFrame); columns 2.. are the
# candidate explanatory variables.
y=pd.DataFrame(df[[1]])
x=df[range(2,(df.shape[1]))]
print "x"
print x
#y=df[[0]]
#x=df[range(1,(df.shape[1]))]
#quit()
#exit()

cat_key=[3,4,7,9,12,13,14,15]
cat_x=x[cat_key].fillna("No_data")
#print cat_x
#quit()
#exit()
cat_dict=cat_x.T.to_dict().values()
vectorizer = DV( sparse = False )
vec_x_cat_train = vectorizer.fit_transform(cat_dict)
vec_x_names=vectorizer.get_feature_names()
x_drop=x.drop(cat_key,axis=1)
print x_drop.isin([np.inf]).any()
#exit()

x_drop=x_drop.replace(np.inf,np.nan)
x_drop=x_drop.fillna(x_drop.median(col=0))

#x_drop=x_drop.replace(np.nan,10000000)


print x_drop.describe()
#xx=x_drop[[5]]
#print xx[xx>=10]
#print x_drop
print x_drop.notnull().all()
print x_drop[8]
#exit()
x=pd.merge(pd.DataFrame(x_drop,index=x.index),pd.DataFrame(vec_x_cat_train,index=x.index,columns=vec_x_names),left_index=True,right_index=True)


#print x.apply(isnull())

#x=pd.DataFrame(vec_x_cat_train,index=x.index,columns=vec_x_names)

#x=pd.DataFrame(np.log(x_drop[[2]]))
#Fee
#print x_drop[[6]]
#Ratio of Area and fee
# Replace the feature table with a single derived feature: the log of
# column 2 divided by column 6 (per the notes above, column 6 is the fee and
# column 2 is presumably the area - confirm against the data file).
x=pd.DataFrame(np.log(x_drop[2]/x_drop[6]))

#x=pd.merge(x,pd.DataFrame(np.log(x_drop[2]/x_drop[6])),left_index=True,right_index=True)


#print x.describe()

#x=pd.DataFrame(np.log(x_drop[2]/x_drop[6]))


#print x_drop.describe()
#exit()

#print x.count
#quit()
#exit()

#y=df[:2]

#x=df.drop("y",axis=1)
#x=df.drop("y",axis=1)

#y=df[[1]]
#print 2:(df.shape[1]-1)
#x=df[range(2,(df.shape[1]))]

#exit()

#print y
#x=df.drop("y",axis=1)
#x=df[["x1","x2","x3"]]
#x=df[[1,2,3]]

#y=y[x.notnull()]
#x=x[x.notnull()]

# Show the target and features, then check how the target rate varies across
# quantile bins of the single feature.
print "Target:"
print y
print "Explain"
print x
# Flatten the one-column target frame into a 1-D array of length n_samples.
y=y.values.reshape((y.shape[0],))
print x
print type(x[0])
#exit()
#quit()
# Cut the feature column into 30 equal-frequency (quantile) bins and print the
# mean of the target inside each bin.
cats=pd.qcut(x[0],30)
print cats
print pd.Series(y).groupby(cats).mean()

#exit()
#quit()
# Split, fit and evaluate on the real data set.
# Hold out 10% of the rows for testing; random_state=0 makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.1, random_state=0)

#x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.001, random_state=0)
#exit()

print y_train.shape
print x_train.shape
print y_train
#exit();
# Logistic regression with an explicit L2 penalty and C=1.0.
cf1=LogisticRegression(C=1.0,penalty="l2")
cf1.fit(x_train,y_train)


#cf1=LogisticRegression(C=1.0,penalty="l2")
#cf1.fit(x_train,y_train)

#exit()


# Predicted labels and column-1 probabilities for both the test and the
# training rows, so the two sets can be evaluated side by side.
y_test_pred=cf1.predict(x_test)
y_train_pred=cf1.predict(x_train)
y_test_prob=(cf1.predict_proba(x_test)[:,1])
y_train_prob=(cf1.predict_proba(x_train)[:,1])


print("Accuracy: %.2f" % accuracy_score(y_test,y_test_pred))
print "Score:"
print cf1.score(x_test,y_test)
print "Confusion_matrix(test train):"
print "test"
print confusion_matrix(y_test, y_test_pred)
print "train"
print confusion_matrix(y_train, y_train_pred)
print "Classification_report (test train):"
print "test:"
print classification_report(y_test, y_test_pred)
print "train:"
print classification_report(y_train, y_train_pred)
#AUC
# AUC is computed from the predicted probabilities (not the hard labels).
# fpr and tpr are computed but not used afterwards in this script.
print "AUC (test train):"
fpr, tpr, _ = roc_curve(y_test, y_test_prob)
auc=roc_auc_score(y_test, y_test_prob)
auc_train=roc_auc_score(y_train, y_train_prob)
print auc,auc_train
# AR is derived from AUC as 2*AUC-1 (presumably the accuracy ratio / Gini).
print "AR (test train):"
print 2*auc-1,2*auc_train-1

#print("Accuracy: %.2f" % (y_test!=y_pred).sum())

# Side-by-side table of true label, predicted label and predicted probability.
print pd.DataFrame({"Anser": y_test,"Prediction": y_test_pred, "Prob": y_test_prob})


# Fitted coefficients, labelled with the feature column names.
print "Coefficients:"
print pd.DataFrame(cf1.coef_,columns=x.keys())

dictをarrayに変換

In [29]: from sklearn.feature_extraction import DictVectorizer

In [30]: features=[{"name":"taro","sex":"M"},{"name":"jiro","sex":"F"}]

In [31]: array_vectors = vec.fit_transform(features).toarray()

In [32]: vec = DictVectorizer()

In [33]: vec
Out[33]:
DictVectorizer(dtype=<type 'numpy.float64'>, separator='=', sort=True,
sparse=True)

In [34]: array_vectors = vec.fit_transform(features).toarray()

In [35]: array_vectors
Out[35]:
array([[ 0., 1., 0., 1.],
[ 1., 0., 1., 0.]])

In [36]: vec.get_feature_names()
Out[36]: ['name=jiro', 'name=taro', 'sex=F', 'sex=M']

データフレームから辞書

In [42]: df=pd.read_csv("test2.txt",sep=",")

In [43]: df
Out[43]:
y x1 x2 x3 x4 x5
0 0 2 1 -8 M T
1 0 5 6 7 M O
2 0 2 10 -12 F N
3 0 3 1 3 M N
4 0 3 1 4 F O
5 1 3 10 -11 M N
6 1 4 13 -12 M N
7 1 3 12 -11 M O
8 1 2 11 -15 F T
9 1 1 11 -16 F T
10 1 3 16 -11 F O

In [44]: cat_df=df[["x4","x5"]]

In [45]: cat_df
Out[45]:
x4 x5
0 M T
1 M O
2 F N
3 M N
4 F O
5 M N
6 M N
7 M O
8 F T
9 F T
10 F O

In [46]: cat_dict=cat_df.T.to_dict().values()

In [47]: cat_dict
Out[47]:
[{'x4': 'M', 'x5': 'T'},
{'x4': 'M', 'x5': 'O'},
{'x4': 'F', 'x5': 'N'},
{'x4': 'M', 'x5': 'N'},
{'x4': 'F', 'x5': 'O'},
{'x4': 'M', 'x5': 'N'},
{'x4': 'M', 'x5': 'N'},
{'x4': 'M', 'x5': 'O'},
{'x4': 'F', 'x5': 'T'},
{'x4': 'F', 'x5': 'T'},
{'x4': 'F', 'x5': 'O'}]

辞書から新しいデータフレーム

In [102]: from sklearn.feature_extraction import DictVectorizer as DV
In [102]:   vectorizer = DV( sparse = False )
In [103]: vec_x_cat_train = vectorizer.fit_transform( cat_dict)

In [104]: vec_x_cat_train
Out[104]:
array([[ 0.,  1.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  1.,  0.],
       [ 1.,  0.,  1.,  0.,  0.],
       [ 0.,  1.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  1.,  0.],
       [ 0.,  1.,  1.,  0.,  0.],
       [ 0.,  1.,  1.,  0.,  0.],
       [ 0.,  1.,  0.,  1.,  0.],
       [ 1.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  1.,  0.]])

In [105]: x=df.drop(["x4","x5"],axis=1)

In [106]: x
Out[106]:
    y  x1  x2  x3
0   0   2   1  -8
1   0   5   6   7
2   0   2  10 -12
3   0   3   1   3
4   0   3   1   4
5   1   3  10 -11
6   1   4  13 -12
7   1   3  12 -11
8   1   2  11 -15
9   1   1  11 -16
10  1   3  16 -11

In [107]: x2=pd.merge(pd.DataFrame(x,index=x.index),pd.DataFrame(vec_x_cat_train,index=x.index),left_index=True,right_index=True)

In [108]: x2
Out[108]:
    y  x1  x2  x3  0  1  2  3  4
0   0   2   1  -8  0  1  0  0  1
1   0   5   6   7  0  1  0  1  0
2   0   2  10 -12  1  0  1  0  0
3   0   3   1   3  0  1  1  0  0
4   0   3   1   4  1  0  0  1  0
5   1   3  10 -11  0  1  1  0  0
6   1   4  13 -12  0  1  1  0  0
7   1   3  12 -11  0  1  0  1  0
8   1   2  11 -15  1  0  0  0  1
9   1   1  11 -16  1  0  0  0  1
10  1   3  16 -11  1  0  0  1  0


In [116]: vectorizer.get_feature_names()
Out[116]: ['x4=F', 'x4=M', 'x5=N', 'x5=O', 'x5=T']

カテゴリカルデータのやつをまとめたやつ

29 cat_key=["x4","x5"]$

27 y=df[[0]]$
28 x=df[range(1,(df.shape[1]))]$
30 cat_x=x[cat_key]$
31 cat_dict=cat_x.T.to_dict().values()$
32 vectorizer = DV( sparse = False )$
33 vec_x_cat_train = vectorizer.fit_transform(cat_dict)$
34 vec_x_names=vectorizer.get_feature_names()$
35 x_drop=x.drop(cat_key,axis=1)$
36 x=pd.merge(pd.DataFrame(x_drop,index=x.index),pd.DataFrame(vec_x_cat_train,index=x.index,columns=vec_x_names),left_index=True,right_index=True)$

平均値で補完

In [363]: b=pd.DataFrame({"a":[1,3,5],"b":[1,4,None],"c":[10,None,32]})

In [364]: b
Out[364]:
a b c
0 1 1 10
1 3 4 NaN
2 5 NaN 32

In [365]: b.fillna(b.mean(axis=0))
Out[365]:
a b c
0 1 1.0 10
1 3 4.0 21
2 5 2.5 32


infとnaを平均値でうめる

In [415]: b.replace(np.inf,np.nan).fillna(b.replace(np.inf,np.nan).mean())
Out[415]:
a c
0 1 3
1 2 4
2 3 4
3 2 5

離散化、ビンに切って、ビニングして、カテゴリごとに集計
x[0]として、pd.Series化することに注意。x[[0]]だと、DataFrame。

cats=pd.qcut(x[0],30)
print cats
print pd.Series(y).groupby(cats).mean()

便利関数群
http://blog.brainpad.co.jp/entry/2014/12/10/204111 より
列ごとに集計してくれる

# Get the first 5 rows of the data
df.head(5)

# Get the last 5 rows of the data
df.tail(5)

# Check the data types (df.info() summary)
df.info()

# Check the row counts and summary statistics
df.describe()

# Check the type of the object itself
type(df)

# Check the type of a single column
type(df['price'])

参考:
http://techblog.yahoo.co.jp/datascience/use_scikit-learn/
http://scikit-learn.org/dev/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression
http://qiita.com/gash717/items/cd9b97a9d26f6ec90df3
http://aidiary.hatenablog.com/entry/20150826/1440596779

AUC
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html

カテゴリカルデータの扱い

  • dataframeからdict

Converting categorical data into numbers with Pandas and Scikit-learn - FastML http://fastml.com/converting-categorical-data-into-numbers-with-pandas-and-scikit-learn/ … via @fastml

  • dict をarrayに変換

http://qiita.com/fukkyy/items/918b97cb0becb65f85ce


How to calculate p-value
http://stackoverflow.com/questions/24255723/sklearn-logistic-regression-important-features

statsmodels
http://statsmodels.sourceforge.net