Morphological analysis that merges consecutive nouns into single tokens (MeCab)
Use the surface-based version at the bottom of this note. The feature-based versions have the problem that unknown words are ignored: for out-of-vocabulary words the base form in the feature string comes back as "*", which the code filters out.
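For reference, a minimal sketch of the difference, assuming the standard MeCab Ruby binding and IPAdic (depending on the binding version you may need to keep a reference to the input string for node.surface to stay valid):
>|ruby|
require 'MeCab'

m = MeCab::Tagger.new()
node = m.parseToNode("塩麹を食べたい")
while node.next != nil do
  node = node.next
  f = node.feature.encode("UTF-8", "UTF-8").split(",")
  next if f[0] == "BOS/EOS"   # skip the end-of-sentence node
  # f[6] is the dictionary base form; for out-of-vocabulary words it is "*",
  # so a feature-based count silently drops them.  node.surface keeps the
  # literal characters from the input.
  puts "surface=" + node.surface.encode("UTF-8", "UTF-8") + "\tbase=" + f[6] + "\tpos=" + f[0]
end
||<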
On compound words
http://kw.kait.jp/opac/kkb-022-021._;jsessionid=227623429A2954BED6FEE927770BEEC4?key=DNHDTD
From http://www.nihongokyoshi.co.jp/manbow/manbow.php?id=832&TAB=1 :
Compound nouns
Language in general <Structure of Japanese>
The syntactic-structure types are noun + verb continuative form (雨降り, 絵かき, 山越え, etc.), verb continuative form + verb continuative form (建てかけ, つりだし, 建て売り, etc.), na-adjective stem + verb continuative form (馬鹿騒ぎ, にわか仕込み, 無理押し, etc.), and i-adjective stem + verb continuative form (浅漬け, 悔し泣き, 悪のり, etc.). Compound nouns of the noun + verb-continuative type in particular show a variety of case relations between their elements. The coordinate-structure types are coordination of near-synonymous elements (手足, etc.) and coordination of antonymous elements (天地, etc.).
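Before writing any joining logic it helps to see how these patterns actually come out of MeCab. A rough sketch (IPAdic assumed; whether a given compound is already a single dictionary entry or gets split depends on the dictionary):
>|ruby|
require 'MeCab'

m = MeCab::Tagger.new()
# one example from each structural type quoted above
["雨降り", "建て売り", "馬鹿騒ぎ", "悔し泣き", "手足", "天地"].each do |w|
  puts "--- " + w
  print m.parse(w)   # shows POS / conjugation type / conjugation form per morpheme
end
||<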
Compound-word patterns
http://members3.jcom.home.ne.jp/balloon_rhetoric/example/compound.html
Lexical system of the IPA dictionary (IPAdic)
Manual
http://chasen.naist.jp/snapshot/ipadic/ipadic/doc/ipadic-ja.pdf
POS tag systems of morphological analysis tools
http://www.unixuser.org/~euske/doc/postag/
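The scripts below all index into the IPAdic feature string, a comma-separated list of POS, POS subdivisions 1-3, conjugation type, conjugation form, base form, reading and pronunciation. A quick sketch to eyeball those fields:
>|ruby|
require 'MeCab'

m = MeCab::Tagger.new()
node = m.parseToNode("ちょい飲み居酒屋で悔し泣き")
while node.next != nil do
  node = node.next
  # feature = POS, POS-sub1..3, conjugation type, conjugation form,
  #           base form, reading, pronunciation
  puts node.surface.encode("UTF-8", "UTF-8") + "\t" + node.feature.encode("UTF-8", "UTF-8")
end
||<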
>|ruby|
require 'MeCab'

m = MeCab::Tagger.new()

# Join runs of consecutive nouns into a single compound-noun token.
# Non-noun morphemes are emitted as their base form (feature[6]).
def get_feature_from_text_continuous(text1, mecab)
  list1 = []
  node = mecab.parseToNode(text1)
  list_continuous_noun = [nil]
  while node.next != nil do
    node = node.next
    word = node.feature.split(",")[6].encode("UTF-8", "UTF-8").gsub("\"", "")
    morphology = node.feature.split(",")[0].encode("UTF-8", "UTF-8")
    if morphology != "名詞"
      # a non-noun ends the current noun run: flush it
      list1.push(word)
      list1.push(list_continuous_noun.join(""))
      list_continuous_noun = []
    else
      list_continuous_noun.push(word)
    end
  end
  list1.push(list_continuous_noun.join(""))
  list1.select!{|a| a.length >= 1}   # drop empty strings
  list1.delete("*")                  # drop unknown-word placeholders
  list1
end

text1 = "塩麹を食べたい"
list1 = get_feature_from_text_continuous(text1, m)
p list1
||<
Applying this to word counting
$ cat noun_tot_count.rb
>|ruby|
require 'MeCab'

# Join runs of consecutive nouns into one token (same function as above).
def get_feature_from_text_continuous(text1, mecab)
  list1 = []
  node = mecab.parseToNode(text1)
  list_continuous_noun = [nil]
  while node.next != nil do
    node = node.next
    word = node.feature.split(",")[6].encode("UTF-8", "UTF-8").gsub("\"", "")
    morphology = node.feature.split(",")[0].encode("UTF-8", "UTF-8")
    if morphology != "名詞"
      list1.push(word)
      list1.push(list_continuous_noun.join(""))
      list_continuous_noun = []
    else
      list_continuous_noun.push(word)
    end
  end
  list1.push(list_continuous_noun.join(""))
  list1.select!{|a| a.length >= 1}
  list1.delete("*")
  list1
end

f2 = open('buumu_tot_kanrenbun3_str.csv')
m = MeCab::Tagger.new()
count = Hash.new(0)
f_out = open(File.expand_path("~/tmp1"), "w")
#f_out = open("buumu_noun_list_count.csv", "w")
iii = 0
f2.each{|i|
  i.chomp!
  puts i
  text_list = i.split("NNEEWWLLIINNEE")   # sentences in a line are joined by this marker
  text_list.each{|j|
    j.chomp!
    list1 = get_feature_from_text_continuous(j, m)
    list1.each{|k| count[k] = count[k] + 1 }
  }
  iii = iii + 1
  puts iii
}
count.keys.each{|i| f_out.puts i + "\t" + count[i].to_s }
||<
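Note on the input: each line of buumu_tot_kanrenbun3_str.csv is a bundle of related sentences joined by the literal marker NNEEWWLLIINNEE, which is why the scripts split on that string. The year-by-year scripts below additionally take the date from the part of the line before the first tab (i.split("\t")[0].split(" ")[0]), so they assume every line starts with a timestamp field.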
$ cat noun_series6.rb
>|ruby|
# -*- coding: utf-8 -*-
require 'MeCab'
require 'set'
require 'csv'

word_count_file="buumu_noun_list_count.csv"
total_text_file='buumu_tot_kanrenbun3_str.csv'
output_file_header="buumu_noun_list_count_year_otiho"
#word_list="food_list0_b.txt"
#f=open(word_list)
#food_list=[]
#id_list=[]

m=MeCab::Tagger.new()
list1=[]
text1="おはよう。うさぎさん, うさぎ"

# simple version: one base form per morpheme
def get_feature_from_text(text1,mecab)
  list1=[]
  node=mecab.parseToNode(text1)
  while node.next != nil do
    node=node.next
    list1.push(node.feature.split(",")[6].encode("UTF-8","UTF-8"))
  end
  list1
end

# compound version: joins runs of consecutive nouns into one token
def get_feature_from_text_continuous(text1,mecab)
  list1=[]
  node=mecab.parseToNode(text1)
  list_continuous_noun=[nil]
  while node.next != nil do
    node=node.next
    word=node.feature.split(",")[6].encode("UTF-8","UTF-8").gsub("\"","")
    morphology=node.feature.split(",")[0].encode("UTF-8","UTF-8")
    if morphology != "名詞"
      list1.push(word)
      list1.push(list_continuous_noun.join(""))
      list_continuous_noun=[]
    else
      list_continuous_noun.push(word)
    end
  end
  list1.push(list_continuous_noun.join(""))
  list1.select!{|a| a.length >=1}
  list1.delete("*")
  list1
end

text1="塩麹を食べたい"
list1=get_feature_from_text_continuous(text1,m)
p list1

# pre-load the word list so every listed word appears in the output even with count 0
key_list=Set.new()
CSV.foreach(word_count_file,{:col_sep => "\t"}){|i| key_list.add(i[0]);puts i[0]}

year_list=["2006","2007","2008","2009","2010","2011","2012","2013","2014","2015"]
f_out=Hash.new()
year_list.each{|j| f_out[j]=open(output_file_header+j+".csv","w")}
f_out2=open("buumu_noun_list_count_year"+"first"+".csv","w")
f2=open(total_text_file)
count=Hash.new(0)
first=Hash.new(0)
key_list.each{|k| count[k]=0;first[k]=0 }
year_old="2006"
year="2006"
iii=0
f2.each{|i|
  i.chomp!
  date=i.split("\t")[0].split(" ")[0]
  year_old=year            # year of the previous line
  year=date.split("-")[0]  # year of the current line
  puts year
  text_list=i.split("NNEEWWLLIINNEE")
  text_list.each{|j|
    j.chomp!
    list1=get_feature_from_text_continuous(j,m)
    list1.each{|k|
      first[k]=date if count[k]==0   # first date the word appears
      count[k]=count[k]+1
    }
  }
  iii=iii+1
  if iii%1000==0 then
    GC.start
    puts year+" "+iii.to_s
  end
  # flush per-year counts when the year changes
  if year_old != year
    count.keys.sort.each{|k| f_out[year_old].puts k.to_s+"\t"+count[k].to_s }
    count.keys.each{|k| count[k]=0}
    GC.start
  end
}
count.keys.sort.each{|k|
  f_out[year].puts k.to_s+"\t"+count[k].to_s
  f_out2.puts k.to_s+"\t"+first[k].to_s
}
||<
Version that handles unknown words
>|ruby|
require 'MeCab'

# Unknown-word-ready version: nouns are taken from node.surface, so words that
# are not in the dictionary (whose base form is "*") are kept instead of dropped.
# Non-nouns still use the base form from the feature string.
def get_feature_from_text_continuous(text1, mecab)
  list1 = []
  node = mecab.parseToNode(text1)
  list_continuous_noun = [nil]
  while node.next != nil do
    node = node.next
    word = node.surface.encode("UTF-8", "UTF-8").gsub("\"", "")
    morphology = node.feature.split(",")[0].encode("UTF-8", "UTF-8")
    if morphology != "名詞"
      word = node.feature.split(",")[6].encode("UTF-8", "UTF-8").gsub("\"", "")
      list1.push(word)
      list1.push(list_continuous_noun.join(""))
      list_continuous_noun = []
    else
      list_continuous_noun.push(word)
    end
  end
  list1.push(list_continuous_noun.join(""))
  list1.select!{|a| a.length >= 1}
  list1.delete("*")
  list1
end
||<
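A quick check, assuming the function above is loaded (which words count as unknown depends on your dictionary, so the output is only illustrative):
>|ruby|
m = MeCab::Tagger.new()
# If 塩麹 is not in the dictionary, the surface-based version still returns it,
# whereas the feature-based version reduces it to "*" and drops it.
p get_feature_from_text_continuous("塩麹を食べたい", m)
||<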
Unknown-word-ready version that handles not only noun + noun but also sequences with adverb + verb continuative form, e.g. ちょい飲み居酒屋.
get_feature_from_text_continuous is the core function (a usage sketch follows the listing).
$ cat noun_series8.rb
>|ruby|
# -*- coding: utf-8 -*-
require 'MeCab'
require 'set'
require 'csv'

total_text_file="buumu_new_2015-04-09_2015-06-21_kanrenbun3_str.csv"
output_file_header="test_2_"

m=MeCab::Tagger.new()
list1=[]
text1="おはよう。うさぎさん, うさぎ"

# simple version: one base form per morpheme
def get_feature_from_text(text1,mecab)
  list1=[]
  node=mecab.parseToNode(text1)
  while node.next != nil do
    node=node.next
    list1.push(node.feature.split(",")[6].encode("UTF-8","UTF-8"))
  end
  list1
end

# compound version: joins runs of nouns / verb continuative forms, and glues a
# preceding adverb onto a following verb continuative form (e.g. ちょい + 飲み).
def get_feature_from_text_continuous(text1,mecab)
  list1=[]
  node=mecab.parseToNode(text1)
  list_continuous_noun=[nil]
  hukushi_flg=false
  hukushi=""
  while node.next != nil do
    node=node.next
    word=node.surface.encode("UTF-8","UTF-8").gsub("\"","")
    morphology=node.feature.split(",")[0].encode("UTF-8","UTF-8")    # POS
    morphology2=node.feature.split(",")[5].encode("UTF-8","UTF-8")   # conjugation form
    if morphology == "名詞" or (morphology=="動詞" and morphology2=="連用形") then
      if hukushi_flg and (morphology=="動詞" and morphology2=="連用形") then
        list_continuous_noun.push(hukushi)
      end
      list_continuous_noun.push(word)
    else
      if hukushi_flg then
        list1.push(word)
        list1.push(list_continuous_noun.join(""))
        list_continuous_noun=[]
      end
      if morphology !="副詞" then
        list1.push(word)
        list1.push(list_continuous_noun.join(""))
        list_continuous_noun=[]
        hukushi_flg=false
        hukushi=""
      else
        hukushi_flg=true
        hukushi=word
      end
    end
  end
  list1.push(list_continuous_noun.join(""))
  list1.select!{|a| a.length >=1}
  list1.delete("*")
  list1
end

text1="ちょい飲みブーム"
list1=get_feature_from_text_continuous(text1,m)
p list1
exit   # NOTE: the script stops here after the test; remove this line to run the counting below

year_list=["2006","2007","2008","2009","2010","2011","2012","2013","2014","2015"]
f_out=Hash.new()
year_list.each{|j| f_out[j]=open(output_file_header+j+".csv","w")}
f_out2=open(output_file_header+"first"+".csv","w")
f2=open(total_text_file)
count=Hash.new(0)
first=Hash.new(0)
year_old="-1"
year="-1"
iii=0
f2.each{|i|
  i.chomp!
  date=i.split("\t")[0].split(" ")[0]
  year_old=year                     # year of the previous line ("-1" on the first line)
  year=date.split("-")[0]           # year of the current line
  year_old=year if year_old=="-1"   # first line: nothing to flush yet
  puts year+","+date
  text_list=i.split("NNEEWWLLIINNEE")
  text_list.each{|j|
    j.chomp!
    list1=get_feature_from_text_continuous(j,m)
    list1.each{|k|
      first[k]=date if count[k]==0   # remember the first date each word appears
      count[k]=count[k]+1
    }
  }
  iii=iii+1
  if iii%1000==0 then
    GC.start
    puts year+" "+iii.to_s
  end
  # flush per-year counts when the year changes
  if year_old != year
    count.keys.sort.each{|k| f_out[year_old].puts k.to_s+"\t"+count[k].to_s }
    count.keys.each{|k| count[k]=0}
    GC.start
  end
}
count.keys.sort.each{|k|
  f_out[year].puts k.to_s+"\t"+count[k].to_s
  f_out2.puts k.to_s+"\t"+first[k].to_s
}
||<
A version that handles the following (a test sketch covering these patterns follows the final listing):
- noun + noun
- adverb + verb continuative form
- noun prefix (接頭詞, 名詞接続) + noun
- verb continuative form + verb continuative form
- adjective ガル接続 + noun
- adjective ガル接続 + verb continuative form (e.g. 悔し笑い)
- adjective + adjective and suffixes (接尾) are not handled yet.
$ cat noun_series8.rb
>|ruby|
# -*- coding: utf-8 -*-
require 'MeCab'
require 'set'
require 'csv'
#word_list="food_list0_b.txt"
#word_count_file="buumu_noun_list_count.csv"
#word_count_file="buumu_noun_list_count.csv"
#word_count_file="buumu_word_hist.txt"
#total_text_file='buumu_tot_kanrenbun3_str.csv'
#total_text_file='buumu_tot_kanrenbun3_str.csv'
#output_file_header="buumu_noun_list_count_year_otiho"
total_text_file="buumu_new_2015-04-09_2015-06-21_kanrenbun3_str.csv"
#output_file_header="/home/h.watanabe/buumu_word_hist_out"
output_file_header="test_2_"
#f=open(word_cou)
#food_list=[]
#id_list=[]
m=MeCab::Tagger.new()
list1=[]
text1="おはよう。うさぎさん, うさぎ"
def get_feature_from_text(text1,mecab)
list1=[]
node=mecab.parseToNode(text1)
while node.next != nil do
node=node.next
list1.push(node.feature.split(",")[6].encode("UTF-8","UTF-8"))
end
list1
end
def get_feature_from_text_continuous(text1,mecab)
list1=[]
node=mecab.parseToNode(text1)
list_continuous_noun=[nil]
hukushi_flg=false
keiyoushi_garu_flg=false
hukushi=""
keiyoushi=""
while node.next != nil do
node=node.next
#word=node.feature.split(",")[6].encode("UTF-8","UTF-8").gsub("\"","")
word=node.surface.encode("UTF-8","UTF-8").gsub("\"","")
morphology=node.feature.split(",")[0].encode("UTF-8","UTF-8")
morphology2=node.feature.split(",")[5].encode("UTF-8","UTF-8")
morphology3=node.feature.split(",")[1].encode("UTF-8","UTF-8")
#morphology2=node.feature
puts morphology2
# puts morphology
#if morphology != "名詞"
#
# list1.push(word)
#
# list1.push(list_continuous_noun.join(""))
# list_continuous_noun=
#else
#
# list_continuous_noun.push(word)
#end
if morphology == "名詞" or (morphology=="動詞" and morphology2=="連用形") or (morphology =="接頭詞" and morphology3=="名詞接続" ) then
if hukushi_flg and (morphology=="動詞" and morphology2=="連用形") then
list_continuous_noun.push(hukushi)
end
if keiyoushi_garu_flg and ((morphology=="動詞" and morphology2=="連用形") or morphology=="名詞") then
list_continuous_noun.push(keiyoushi)
end
list_continuous_noun.push(word)
#list1.push(word)
#list1.push(list_continuous_noun.join(""))
#list_continuous_noun=[]
else
if hukushi_flg then
list1.push(word)
list1.push(list_continuous_noun.join(""))
list_continuous_noun=[]
end
# if morphology !="副詞" then
#
# list1.push(word)
# list1.push(list_continuous_noun.join(""))
# list_continuous_noun=
#
# hukushi_flg=false
# hukushi=""
#
# else
if morphology=="副詞" or (morphology =="形容詞" and morphology2=="ガル接続") then
if morphology=="副詞" then
hukushi_flg=true
hukushi=word
end
if morphology=="形容詞" then
keiyoushi_garu_flg=true
keiyoushi=word
end
else
list1.push(word)
list1.push(list_continuous_noun.join(""))
list_continuous_noun=[]
hukushi_flg=false
keiyoushi_garu_flg=false
hukushi=""
keiyoushi=""
end
end
#if morphology == "副詞" then
# hukushi_flg=true
# hukushi=word
#else
# hukushi_flg=false
# hukushi=""
#end
end
list1.push(list_continuous_noun.join(""))
#list1.each{|a| puts a.length }
list1.select!{|a| a.length >=1}
list1.delete("*")
list1
end
#text1="塩麹を食べたい"
#list1=get_feature_from_text_continuous(text1,m)
#p list1
#text1="ちょい飲みブーム"
#text1="おごはん"
text1="悔し笑い"
list1=get_feature_from_text_continuous(text1,m)
p list1
exit   # NOTE: the script stops here after the test; remove this line to run the counting below
#exit
#list1=get_feature_from_text2(text1,m)
#puts list1
# str="うさぎ"
#f2=open('buumu_tot_kanrenbun3_str.csv')
#f2=open('tmp3')
#k=0;
#count=Hash.new{|h,k| h[k]=Hash["2006" => 0, "2007" => 0, "2008" => 0, "2009" => 0, "2010" => 0, "2011" => 0, "2012" =>0 ,"2013"=>0,"2014"=>0,"2015"=>0]}
#####key_list=Set.new()
###CSV.foreach(word_count_file,{:col_sep => "\t"}){|i| key_list.add(i[0]);puts i[0]}
#exit
#year_list=["2006","2007","2008","2009","2010","2011","2012","2013","2014","2015"]
year_list=["2006","2007","2008","2009","2010","2011","2012","2013","2014","2015"]
f_out=Hash.new()
year_list.each{|j| f_out[j]=open(output_file_header+j+".csv","w")}
f_out2=open(output_file_header+"first"+".csv","w")
f2=open(total_text_file)
count=Hash.new(0)
first=Hash.new(0)
#key_list.each{|k| count[k]=0;first[k]=0 ;}
year_old="-1"
year="2006"
f2.each{|i|
#year_list.each{|year2|
#GC.disable
# if year2 == "first" then
# count=Hash.new(0)
# else
#:w count=Hash.new(0)
# key_list.each{|k| count[k]=0 }
# end
#f_out=open("buumu_noun_list_count_year"+year2+".csv","w")
i.chomp!
date=i.split("\t")[0].split(" ")[0]
year_old=year                     # year of the previous line ("-1" on the first line)
year=date.split("-")[0]           # year of the current line
year_old=year if year_old=="-1"   # first line: nothing to flush yet
# if year2==year or year2=="first" then
puts year+","+date
text_list=i.split("NNEEWWLLIINNEE")
text_list.each{|j|
j.chomp!
list1=get_feature_from_text_continuous(j,m)
#print list1
list1.each{|k|
#if year2=="first" then
# key_list.add(k)
# count[k]=date
#else
if count[k]==0 then
first[k]=date
end
count[k]=count[k]+1
#end
}
}
iii=iii+1
#puts year2+","+iii.to_s
if iii%1000==0 then
GC.start
#GC.disablea
puts year+" "+iii.to_s
end
#
# puts i
# puts j
#if iii >= 100 then
# #break
#end
#print count
#exita
#print "test:"+"-"+year_old+"-"+year+"\n";
if year_old != year
count.keys.sort.each{|iiii|
f_out[year_old].puts iiii.to_s+"\t"+count[iiii].to_s
}
count.keys.each{|iiii| count[iiii]=0}
GC.start
end
# }
#f2.close()
# end
}
count.keys.sort.each{|i|
f_out[year].puts i.to_s+"\t"+count[i].to_s
f_out2.puts i.to_s+"\t"+first[i].to_s
}
#count.keys.each{|iiii| count[i]=0}
#GC.start
||<
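To sanity-check the pattern list above against this script, the single test call near the top (text1="悔し笑い" followed by exit) can be swapped for a small driver like this; the example words just illustrate each pattern and the actual splits depend on the dictionary:
>|ruby|
m = MeCab::Tagger.new()
[
  "塩麹ブーム",         # noun + noun
  "ちょい飲み居酒屋",   # adverb + verb continuative form (+ noun)
  "超高級マンション",   # prefix (接頭詞, 名詞接続) + noun
  "建て売り住宅",       # verb continuative form + verb continuative form
  "悔し笑い"            # adjective ガル接続 + verb continuative form
].each do |t|
  p get_feature_from_text_continuous(t, m)
end
||<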