文章から連続する名詞をひとまとめにした形態素解析・複合語の形態素解析

名詞をひとまとめにした形態素解析 mecab
一番下の surface 版を使うこと。feature 版だと未知語が無視される問題がある。

複合語について
http://kw.kait.jp/opac/kkb-022-021._;jsessionid=227623429A2954BED6FEE927770BEEC4?key=DNHDTD

http://www.nihongokyoshi.co.jp/manbow/manbow.php?id=832&TAB=1 より



複合名詞
言語一般<日本語の構造>
統語構造の型は、名詞+動詞連用形(「雨降り」「絵かき」「山越え」など)、動詞連用形+動詞連用形(「建てかけ」「つりだし」「建て売り」など)、形容動詞語幹+動詞連用形(「馬鹿騒ぎ」「にわか仕込み」「無理押し」など)、形容詞語幹+動詞連用形(「浅漬け」「悔し泣き」「悪のり」など)がある。特に、名詞+動詞連用形から成る複合名詞は、その中にさまざまな格関係が見られる。並列構造の型は、類義成分の並列(「手足」など)と対義成分の並列(「天地」など)がある。


複合語パタン
http://members3.jcom.home.ne.jp/balloon_rhetoric/example/compound.html
IPA辞書の語彙体系
マニュアル
http://chasen.naist.jp/snapshot/ipadic/ipadic/doc/ipadic-ja.pdf
形態素解析ツールの品詞体系
http://www.unixuser.org/~euske/doc/postag/


  # Tokenize +text1+ with MeCab, fusing every run of consecutive nouns into a
  # single compound token.
  #
  # text1 - String to analyse.
  # mecab - tagger whose parseToNode(text1) returns the head of a linked node
  #         list; every node must respond to #feature (comma-separated
  #         String) and #next.
  #
  # Returns an Array of Strings. A non-noun morpheme contributes feature
  # field 6 (presumably the base form in the IPA dictionary layout -- confirm
  # for other dictionaries); a noun run contributes the concatenation of the
  # field-6 values of its members. The emission order of the original code is
  # kept on purpose: the non-noun word is pushed BEFORE the noun run it
  # terminates. Empty strings and the "*" placeholder are removed.
  def get_feature_from_text_continuous(text1,mecab)
    tokens=[]
    noun_run=[]
    node=mecab.parseToNode(text1)
    # The head node is skipped; morphemes start at head.next.
    while (node=node.next)
      fields=node.feature.split(",")
      # A feature with fewer than 7 fields is treated like a "*" placeholder
      # instead of crashing on nil.encode.
      word=(fields[6] || "*").encode("UTF-8","UTF-8").delete("\"")
      if fields[0].to_s.encode("UTF-8","UTF-8") == "名詞"
        noun_run.push(word)
      else
        tokens.push(word)
        tokens.push(noun_run.join(""))
        noun_run=[]
      end
    end
    tokens.push(noun_run.join(""))

    # Filter in one non-chained step: select!/reject! return nil when nothing
    # was removed, so chaining .delete onto them can raise NoMethodError.
    tokens.reject!{|t| t.empty? || t == "*"}
    tokens
  end


 
  # Demo: tokenize a sample sentence and print the resulting token list.
  # NOTE(review): `m` is not defined anywhere in this excerpt; presumably it
  # is a MeCab::Tagger created beforehand (m=MeCab::Tagger.new()) -- confirm.
  text1="塩麹を食べたい"
  list1=get_feature_from_text_continuous(text1,m)
  p list1


ワードカウントに応用

cat noun_tot_count.rb
require 'MeCab'

  # Split +text1+ into tokens with MeCab, merging each run of consecutive
  # nouns into one compound token (used below for word counting).
  #
  # text1 - String to analyse.
  # mecab - object whose parseToNode(text1) returns the first node of a
  #         linked list; nodes respond to #feature and #next.
  #
  # Returns an Array of Strings built from feature field 6 of each morpheme
  # (presumably the dictionary base form -- confirm against the dictionary in
  # use). As in the original code, a non-noun word is appended before the
  # noun run that it closes. Empty strings and "*" placeholders are removed.
  def get_feature_from_text_continuous(text1,mecab)
    result=[]
    pending_nouns=[]
    node=mecab.parseToNode(text1)
    while (node=node.next)
      columns=node.feature.split(",")
      part_of_speech=columns[0].to_s.encode("UTF-8","UTF-8")
      # Missing field 6 falls back to the "*" placeholder rather than
      # raising NoMethodError on nil.
      word=(columns[6] || "*").encode("UTF-8","UTF-8").delete("\"")

      if part_of_speech == "名詞"
        pending_nouns.push(word)
        next
      end

      result.push(word)
      result.push(pending_nouns.join(""))
      pending_nouns=[]
    end
    result.push(pending_nouns.join(""))

    # select! returns nil when it removes nothing, so the previous
    # select!{...}.delete("*") chain could blow up; filter in a single pass.
    result.reject!{|w| w.empty? || w == "*"}
    result
  end




 # Word count over the whole corpus: every input line is split on the
 # "NNEEWWLLIINNEE" marker, each piece is tokenized with
 # get_feature_from_text_continuous, and the per-token totals are written as
 # "token<TAB>count" lines.
 f2=open('buumu_tot_kanrenbun3_str.csv')
 m=MeCab::Tagger.new()
 count=Hash.new(0)

 # Kernel#open does not expand "~" (that is a shell feature), so the path is
 # expanded explicitly; otherwise Ruby looks for a directory literally named
 # "~" under the current directory. Opened before the long loop so that a bad
 # path fails early.
 f_out=open(File.expand_path("~/tmp1"),"w")
 iii=0
 f2.each{|i|
   i.chomp!
   puts i
   i.split("NNEEWWLLIINNEE").each{|j|
     j.chomp!
     get_feature_from_text_continuous(j,m).each{|k|
       count[k]=count[k]+1
     }
   }
   # Progress: number of input lines processed so far.
   iii=iii+1
   puts iii
 }

 count.each{|word,total|
   f_out.puts word+"\t"+total.to_s
 }

 # Close explicitly so the output is flushed deterministically.
 f_out.close
 f2.close


cat noun_series6.rb

# -*- coding: Utf-8 -*-

require 'MeCab'
require 'set'
require 'csv'


#word_list="food_list0_b.txt"



# Tab-separated "token<TAB>count" file whose first column seeds the vocabulary.
word_count_file="buumu_noun_list_count.csv"

# Corpus to analyse; the leading tab-separated field of each line starts with a date.
total_text_file='buumu_tot_kanrenbun3_str.csv'

# Prefix of the per-year output files ("<prefix><year>.csv").
output_file_header="buumu_noun_list_count_year_otiho"
# Disabled: `word_list` is only assigned in a commented-out line, so this
# open raised NameError before anything else ran, and `f` was never used.
#f=open(word_list)




food_list=[]
id_list=[]

m=MeCab::Tagger.new()
list1=[]
text1="おはよう。うさぎさん, うさぎ"
# Return feature field 6 of every node that follows the head node returned by
# mecab.parseToNode(text1), in text order.
#
# text1 - String to analyse.
# mecab - tagger whose parseToNode returns a linked list of nodes responding
#         to #feature and #next.
#
# Returns an Array of Strings (nothing is filtered, so placeholder values
# such as "*" are included).
def get_feature_from_text(text1,mecab)
        words = []
        cursor = mecab.parseToNode(text1)
        until cursor.next.nil?
                cursor = cursor.next
                columns = cursor.feature.split(",")
                words << columns[6].encode("UTF-8","UTF-8")
        end
        words
end




# Tokenize +text1+ with MeCab and glue consecutive nouns into one compound
# token.
#
# text1 - String to analyse.
# mecab - tagger whose parseToNode(text1) returns the head of a linked node
#         list; nodes respond to #feature (comma-separated String) and #next.
#
# Returns an Array of Strings taken from feature field 6 (presumably the base
# form under the IPA dictionary layout -- confirm). The original emission
# order is preserved: a non-noun word comes before the noun run it ends.
# Empty strings and "*" placeholders are filtered out.
def get_feature_from_text_continuous(text1,mecab)
        tokens=[]
        noun_run=[]
        node=mecab.parseToNode(text1)
        while (node=node.next)
                fields=node.feature.split(",")
                # Fall back to "*" when field 6 is absent so that a short
                # feature string cannot raise NoMethodError.
                word=(fields[6] || "*").encode("UTF-8","UTF-8").delete("\"")
                if fields[0].to_s.encode("UTF-8","UTF-8") == "名詞"
                        noun_run.push(word)
                else
                        tokens.push(word)
                        tokens.push(noun_run.join(""))
                        noun_run=[]
                end
        end
        tokens.push(noun_run.join(""))

        # One non-chained filter: select! yields nil when it removes nothing,
        # which made the old select!{...}.delete("*") chain unsafe.
        tokens.reject!{|t| t.empty? || t == "*"}
        tokens
end

# Smoke test: tokenize a sample sentence with the tagger `m` and print the
# resulting token list.
text1="塩麹を食べたい"
list1=get_feature_from_text_continuous(text1,m)
p list1


#exit

#list1=get_feature_from_text2(text1,m)
#puts list1
#       str="うさぎ"




#f2=open('buumu_tot_kanrenbun3_str.csv')


#f2=open('tmp3')
#k=0;
#count=Hash.new{|h,k| h[k]=Hash["2006" => 0, "2007" => 0, "2008" => 0, "2009" => 0, "2010" => 0, "2011" => 0, "2012" =>0 ,"2013"=>0,"2014"=>0,"2015"=>0]}



# Vocabulary: first column of every row of the tab-separated word-count file
# (each key is also echoed to stdout).
# The separator is passed as a keyword argument. A positional options Hash is
# not converted to keywords on Ruby 3, where it would be taken as the
# positional `mode` argument of CSV.foreach instead.
key_list=Set.new()
CSV.foreach(word_count_file,col_sep: "\t"){|i| key_list.add(i[0]);puts i[0]}


#exit

#year_list=["2006","2007","2008","2009","2010","2011","2012","2013","2014","2015"]

# Per-year token counts.
#
# Reads total_text_file line by line. The leading tab-separated field of a
# line starts with a "YYYY-..." date; the line is split on "NNEEWWLLIINNEE"
# and every piece is tokenized with get_feature_from_text_continuous.
# Whenever the year changes, the totals of the finished year are written to
# "<output_file_header><year>.csv" as "token<TAB>count" and all counters are
# reset to 0. After the last line the running totals go to the file of the
# last year, and "<output_file_header>first.csv" receives the date on which
# each token was first seen (0 if never seen).
year_list=["2006","2007","2008","2009","2010","2011","2012","2013","2014","2015"]

# Files for year_list are created up front; a year outside the list gets its
# file lazily instead of yielding nil (which crashed on nil.puts).
f_out=Hash.new{|files,y| files[y]=open(output_file_header+y+".csv","w")}
year_list.each{|j| f_out[j]}

# Was only present as a commented-out line, so the final loop raised NameError.
f_out2=open(output_file_header+"first"+".csv","w")
f2=open(total_text_file)

count=Hash.new(0)
first=Hash.new(0)
key_list.each{|k| count[k]=0;first[k]=0 ;}
year_old="2006"
year="2006"
# Line counter for progress output; it has to live outside the loop, otherwise
# it is reset on every line and the progress branch is never reached.
iii=0
f2.each{|i|
        i.chomp!

        date=i.split("\t")[0].to_s.split(" ")[0]
        # Skip blank lines / lines without a leading date field.
        next if date.nil?

        year_old=year
        year=date.split("-")[0]

        # Flush BEFORE tallying the current line so that the first line of a
        # new year is not credited to the previous one.
        if year_old != year
                count.keys.sort.each{|w|
                        f_out[year_old].puts w.to_s+"\t"+count[w].to_s
                }
                # Reset every counter (the old code indexed with the wrong
                # variable and therefore reset nothing).
                count.keys.each{|w| count[w]=0}
                GC.start
        end

        puts year
        i.split("NNEEWWLLIINNEE").each{|j|
                j.chomp!
                get_feature_from_text_continuous(j,m).each{|k|
                        # Record only the very first sighting; testing
                        # count[k]==0 would overwrite it after each yearly reset.
                        first[k]=date if first[k]==0
                        count[k]=count[k]+1
                }
        }

        iii=iii+1
        if iii%1000==0 then
                GC.start
                puts year+" "+iii.to_s
        end
}

count.keys.sort.each{|w|
        f_out[year].puts w.to_s+"\t"+count[w].to_s

        f_out2.puts w.to_s+"\t"+first[w].to_s
}

f2.close
f_out.each_value{|f| f.close}
f_out2.close
#count.keys.each{|iiii| count[i]=0}
#GC.start

#f2.close()

未知語対応版

# Unknown-word-aware variant: nouns are taken from the node surface (the text
# as written), so nouns whose feature field 6 is only the "*" placeholder are
# kept. Consecutive nouns are merged into one compound token.
#
# text1 - String to analyse.
# mecab - tagger whose parseToNode(text1) returns the head of a linked node
#         list; nodes respond to #surface, #feature and #next.
#
# Returns an Array of Strings. Non-noun morphemes contribute feature field 6
# (presumably the base form -- confirm for the dictionary in use). As in the
# original, a non-noun word is emitted before the noun run it terminates.
# Empty strings and "*" placeholders are removed.
def get_feature_from_text_continuous(text1,mecab)
        tokens=[]
        noun_run=[]
        node=mecab.parseToNode(text1)
        while (node=node.next)
                fields=node.feature.split(",")
                if fields[0].to_s.encode("UTF-8","UTF-8") == "名詞"
                        noun_run.push(node.surface.encode("UTF-8","UTF-8").delete("\""))
                else
                        # A missing field 6 becomes "*" instead of raising on nil.
                        tokens.push((fields[6] || "*").encode("UTF-8","UTF-8").delete("\""))
                        tokens.push(noun_run.join(""))
                        noun_run=[]
                end
        end
        tokens.push(noun_run.join(""))

        # Single non-chained filter; select!{...}.delete("*") raised
        # NoMethodError whenever select! removed nothing and returned nil.
        tokens.reject!{|t| t.empty? || t == "*"}
        tokens
end

未知語対応版: 名詞-名詞、だけでなく、名詞-副詞-動詞(連用形) 「ちょい飲み居酒屋」などにも対応
get_feature_from_text_continuous が本体.


h.watanabe@HTL_WS_089 ~/syoku_ytyou_honbun2/buumu
$ cat noun_series8.rb
# -*- coding: Utf-8 -*-

require 'MeCab'
require 'set'
require 'csv'


#word_list="food_list0_b.txt"



#word_count_file="buumu_noun_list_count.csv"


#word_count_file="buumu_noun_list_count.csv"
#word_count_file="buumu_word_hist.txt"

#total_text_file='buumu_tot_kanrenbun3_str.csv'

#total_text_file='buumu_tot_kanrenbun3_str.csv'
#output_file_header="buumu_noun_list_count_year_otiho"
# Corpus file that the counting loop reads line by line.
total_text_file = "buumu_new_2015-04-09_2015-06-21_kanrenbun3_str.csv"

# Prefix of every output file name ("<prefix><suffix>.csv").
#output_file_header="/home/h.watanabe/buumu_word_hist_out"
output_file_header = "test_2_"
#f=open(word_cou)

#food_list=[]
#id_list=[]

# Tagger shared by all calls below, plus scratch variables for quick experiments.
m = MeCab::Tagger.new
list1 = []
text1 = "おはよう。うさぎさん, うさぎ"
# Collect feature field 6 of each node after the head node of
# mecab.parseToNode(text1), preserving text order.
#
# text1 - String to analyse.
# mecab - tagger whose parseToNode returns a linked list of nodes responding
#         to #feature and #next.
#
# Returns an Array of Strings; no filtering is applied.
def get_feature_from_text(text1,mecab)
        base_forms=[]
        current=mecab.parseToNode(text1)
        loop do
                break if current.next == nil
                current=current.next
                base_forms.push(current.feature.split(",")[6].encode("UTF-8","UTF-8"))
        end
        base_forms
end


# Compound-aware tokenizer. Besides runs of nouns it also folds
# "adverb + verb in 連用形" into the running compound (e.g. the sample
# "ちょい飲みブーム" becomes one token when tagged as adverb, verb, noun).
#
# text1 - String to analyse.
# mecab - tagger whose parseToNode(text1) returns the head of a linked node
#         list; nodes respond to #surface, #feature and #next.
#
# Compound members are nouns (feature field 0 == "名詞") and verbs whose
# feature field 5 is "連用形". An adverb ("副詞") is held back for one node:
# it joins the compound only when that node is such a verb. Any other word
# ends the compound.
#
# Returns an Array of surface Strings. The original emission order is kept:
# the word that ends a compound is pushed before the compound itself. Empty
# strings and "*" are removed.
#
# Fixes relative to the previous revision:
# * no debug output on stdout for every node;
# * the held-back adverb is consumed once (it used to be re-inserted before
#   every later verb);
# * a word that follows a held-back adverb is emitted once, not twice;
# * an adverb that is not followed by a 連用形 verb is emitted as its own
#   token instead of being dropped, and it ends the compound before it.
def get_feature_from_text_continuous(text1,mecab)
        tokens=[]
        compound=[]
        pending_adverb=nil

        # Close the current compound: push the delimiting word, then the
        # compound collected so far, and start a new compound.
        flush=lambda do |delimiter|
                tokens.push(delimiter)
                tokens.push(compound.join(""))
                compound.clear
        end

        node=mecab.parseToNode(text1)
        while (node=node.next)
                fields=node.feature.split(",")
                word=node.surface.encode("UTF-8","UTF-8").delete("\"")
                # to_s guards against feature strings with fewer fields.
                pos=fields[0].to_s.encode("UTF-8","UTF-8")
                conjugation=fields[5].to_s.encode("UTF-8","UTF-8")
                renyou_verb=(pos=="動詞" && conjugation=="連用形")

                if pending_adverb
                        if renyou_verb
                                compound.push(pending_adverb)
                        else
                                flush.call(pending_adverb)
                        end
                        pending_adverb=nil
                end

                if pos=="名詞" || renyou_verb
                        compound.push(word)
                elsif pos=="副詞"
                        pending_adverb=word
                else
                        flush.call(word)
                end
        end
        # Emit whatever is still open (only non-empty if the node list did
        # not end with a delimiting node).
        flush.call(pending_adverb.to_s)

        # Single non-chained filter: select! returns nil when nothing is removed.
        tokens.reject!{|t| t.empty? || t == "*"}
        tokens
end

#text1="塩麹を食べたい"
#list1=get_feature_from_text_continuous(text1,m)
#p list1

# Smoke test for the adverb + verb compound handling: prints the token list
# for a sample phrase.
text1="ちょい飲みブーム"
list1=get_feature_from_text_continuous(text1,m)
p list1
# NOTE: this exit ends the script here, so nothing below it is executed while
# it is present.
exit

#exit

#list1=get_feature_from_text2(text1,m)
#puts list1
#       str="うさぎ"




#f2=open('buumu_tot_kanrenbun3_str.csv')


#f2=open('tmp3')
#k=0;
#count=Hash.new{|h,k| h[k]=Hash["2006" => 0, "2007" => 0, "2008" => 0, "2009" => 0, "2010" => 0, "2011" => 0, "2012" =>0 ,"2013"=>0,"2014"=>0,"2015"=>0]}



#####key_list=Set.new()
###CSV.foreach(word_count_file,{:col_sep => "\t"}){|i| key_list.add(i[0]);puts i[0]}


#exit

#year_list=["2006","2007","2008","2009","2010","2011","2012","2013","2014","2015"]

# Per-year token counts.
#
# Reads total_text_file line by line. The leading tab-separated field of a
# line starts with a "YYYY-..." date; the line is split on "NNEEWWLLIINNEE"
# and every piece is tokenized with get_feature_from_text_continuous.
# Whenever the year changes, the totals of the finished year are written to
# "<output_file_header><year>.csv" as "token<TAB>count" and all counters are
# reset to 0. After the last line the running totals go to the file of the
# last year, and "<output_file_header>first.csv" receives the date on which
# each token was first seen.
year_list=["2006","2007","2008","2009","2010","2011","2012","2013","2014","2015"]

# Files for year_list are created up front; a year outside the list gets its
# file lazily instead of yielding nil (which crashed on nil.puts).
f_out=Hash.new{|files,y| files[y]=open(output_file_header+y+".csv","w")}
year_list.each{|j| f_out[j]}

f_out2=open(output_file_header+"first"+".csv","w")
f2=open(total_text_file)

count=Hash.new(0)
first=Hash.new(0)
# "-1" marks "no line seen yet".
year_old="-1"
year="2006"
# Line counter for progress output; kept outside the loop so that it is not
# reset on every line.
iii=0
f2.each{|i|
        i.chomp!

        date=i.split("\t")[0].to_s.split(" ")[0]
        # Skip blank lines / lines without a leading date field.
        next if date.nil?

        year=date.split("-")[0]
        if year_old=="-1" then
                puts "koodayo"
        elsif year_old != year
                # Year rollover: write the finished year and reset. Previously
                # year_old was overwritten with year in both branches before
                # this comparison, so the rollover could never trigger.
                count.keys.sort.each{|iiii|
                        f_out[year_old].puts iiii.to_s+"\t"+count[iiii].to_s
                }
                count.keys.each{|iiii| count[iiii]=0}
                GC.start
        end
        year_old=year

        puts year+","+date
        i.split("NNEEWWLLIINNEE").each{|j|
                j.chomp!
                get_feature_from_text_continuous(j,m).each{|k|
                        # Record only the very first sighting; testing
                        # count[k]==0 would overwrite it after a yearly reset.
                        first[k]=date if first[k]==0
                        count[k]=count[k]+1
                }
        }

        iii=iii+1
        if iii%1000==0 then
                GC.start
                # The old code printed an undefined variable (year2) here.
                puts year+" "+iii.to_s
        end
}

count.keys.sort.each{|i|
        f_out[year].puts i.to_s+"\t"+count[i].to_s

        f_out2.puts i.to_s+"\t"+first[i].to_s
}

f2.close
f_out.each_value{|f| f.close}
f_out2.close
#count.keys.each{|iiii| count[i]=0}
#GC.start

#f2.close()




以下に対応したもの

  • 名詞-名詞
  • 副詞-動詞連用形
  • 名詞接頭-名詞
  • 動詞連用形-動詞連用形
  • 副詞-動詞連用形
  • 形容詞ガル接続-名詞 
  • 形容詞ガル接続-動詞連用形 悔し笑い
  • 形容詞+形容詞,接尾語は未対応.


$ cat noun_series8.rb
>|ruby|
# -*- coding: Utf-8 -*-

require 'MeCab'
require 'set'
require 'csv'


#word_list="food_list0_b.txt"



#word_count_file="buumu_noun_list_count.csv"


#word_count_file="buumu_noun_list_count.csv"
#word_count_file="buumu_word_hist.txt"

#total_text_file='buumu_tot_kanrenbun3_str.csv'

#total_text_file='buumu_tot_kanrenbun3_str.csv'
#output_file_header="buumu_noun_list_count_year_otiho"
# Corpus file that the counting loop reads line by line.
total_text_file="buumu_new_2015-04-09_2015-06-21_kanrenbun3_str.csv"


#output_file_header="/home/h.watanabe/buumu_word_hist_out"

# Prefix of every output file name.
output_file_header="test_2_"
#f=open(word_cou)




#food_list=[]
#id_list=[]

m=MeCab::Tagger.new()
# The "[]" literal had been stripped from this listing, which turned the line
# into the chained assignment list1=text1="...".
list1=[]
text1="おはよう。うさぎさん, うさぎ"
# Return feature field 6 of every node that follows the head node returned by
# mecab.parseToNode(text1), in text order.
#
# text1 - String to analyse.
# mecab - tagger whose parseToNode returns a linked list of nodes responding
#         to #feature and #next.
#
# Returns an Array of Strings (no filtering is applied).
def get_feature_from_text(text1,mecab)
  # Restored the "[]" literal that was missing from this listing; without it
  # the line chained into list1=node=..., and push was called on a node.
  list1=[]
  node=mecab.parseToNode(text1)
  while node.next != nil do
    node=node.next
    list1.push(node.feature.split(",")[6].encode("UTF-8","UTF-8"))
  end
  list1
end


# Compound-aware tokenizer covering the patterns listed for this version:
# noun-noun, noun prefix-noun, verb(連用形)-verb(連用形),
# adverb-verb(連用形), ガル接続 adjective-noun and
# ガル接続 adjective-verb(連用形).
#
# text1 - String to analyse.
# mecab - tagger whose parseToNode(text1) returns the head of a linked node
#         list; nodes respond to #surface, #feature and #next.
#
# Feature fields used: 0 (part of speech), 1 (sub-category) and 5
# (conjugated form). Compound members are nouns, verbs in 連用形 and
# prefixes with sub-category 名詞接続. An adverb or a ガル接続 adjective is
# held back for one node: the adverb joins the compound only before a 連用形
# verb, the adjective before a 連用形 verb or a noun. Any other word ends the
# compound.
#
# Returns an Array of surface Strings. The emission order of the earlier
# versions is kept: the word that ends a compound is pushed before the
# compound itself. Empty strings and "*" are removed.
#
# This listing had lost its "[]" literals and part of one condition to the
# blog markup and was not valid Ruby; the code is reconstructed here. While
# doing so the held-back modifier is now consumed exactly once (it used to be
# re-inserted before every later member), a word after a held-back adverb is
# emitted once instead of twice, a modifier that cannot attach is emitted as
# its own token, and the per-node debug output is gone.
def get_feature_from_text_continuous(text1,mecab)
  tokens=[]
  compound=[]
  pending=nil # [kind, surface] of a held-back adverb / ガル接続 adjective

  # Close the current compound: push the delimiting word, then the compound
  # collected so far, and start a new compound.
  flush=lambda do |delimiter|
    tokens.push(delimiter)
    tokens.push(compound.join(""))
    compound.clear
  end

  node=mecab.parseToNode(text1)
  while (node=node.next)
    fields=node.feature.split(",")
    word=node.surface.encode("UTF-8","UTF-8").delete("\"")
    # to_s guards against feature strings with fewer fields.
    pos=fields[0].to_s.encode("UTF-8","UTF-8")
    subtype=fields[1].to_s.encode("UTF-8","UTF-8")
    conjugation=fields[5].to_s.encode("UTF-8","UTF-8")

    noun=(pos=="名詞")
    renyou_verb=(pos=="動詞" && conjugation=="連用形")
    noun_prefix=(pos=="接頭詞" && subtype=="名詞接続")

    if pending
      kind,modifier=pending
      if renyou_verb || (kind==:garu_adjective && noun)
        compound.push(modifier)
      else
        flush.call(modifier)
      end
      pending=nil
    end

    if noun || renyou_verb || noun_prefix
      compound.push(word)
    elsif pos=="副詞"
      pending=[:adverb,word]
    elsif pos=="形容詞" && conjugation=="ガル接続"
      pending=[:garu_adjective,word]
    else
      flush.call(word)
    end
  end
  # Emit whatever is still open (only non-empty if the node list did not end
  # with a delimiting node).
  flush.call(pending ? pending[1] : "")

  # Single non-chained filter: select! returns nil when nothing is removed.
  tokens.reject!{|t| t.empty? || t == "*"}
  tokens
end

#text1="塩麹を食べたい"
#list1=get_feature_from_text_continuous(text1,m)
#p list1

#text1="ちょい飲みブーム"

#text1="おごはん"

# Smoke test for the ガル接続 adjective handling: prints the token list for a
# sample word.
text1="悔し笑い"
list1=get_feature_from_text_continuous(text1,m)
p list1
# NOTE: this exit ends the script here, so nothing below it is executed while
# it is present.
exit

#exit

#list1=get_feature_from_text2(text1,m)
#puts list1
# str="うさぎ"




#f2=open('buumu_tot_kanrenbun3_str.csv')


#f2=open('tmp3')
#k=0;
#count=Hash.new{|h,k| h[k]=Hash["2006" => 0, "2007" => 0, "2008" => 0, "2009" => 0, "2010" => 0, "2011" => 0, "2012" =>0 ,"2013"=>0,"2014"=>0,"2015"=>0]}



#####key_list=Set.new()
###CSV.foreach(word_count_file,{:col_sep => "\t"}){|i| key_list.add(i[0]);puts i[0]}


#exit

#year_list=["2006","2007","2008","2009","2010","2011","2012","2013","2014","2015"]

# Per-year token counts.
#
# Reads total_text_file line by line. The leading tab-separated field of a
# line starts with a "YYYY-..." date; the line is split on "NNEEWWLLIINNEE"
# and every piece is tokenized with get_feature_from_text_continuous.
# Whenever the year changes, the totals of the finished year are written to
# "<output_file_header><year>.csv" as "token<TAB>count" and all counters are
# reset to 0. After the last line the running totals go to the file of the
# last year, and "<output_file_header>first.csv" receives the date on which
# each token was first seen.
year_list=["2006","2007","2008","2009","2010","2011","2012","2013","2014","2015"]

# Files for year_list are created up front; a year outside the list gets its
# file lazily instead of yielding nil (which crashed on nil.puts).
f_out=Hash.new{|files,y| files[y]=open(output_file_header+y+".csv","w")}
year_list.each{|j| f_out[j]}

f_out2=open(output_file_header+"first"+".csv","w")
f2=open(total_text_file)

count=Hash.new(0)
first=Hash.new(0)
# "-1" marks "no line seen yet".
year_old="-1"
year="2006"
# Line counter for progress output; kept outside the loop so that it is not
# reset on every line.
iii=0
f2.each{|i|
  i.chomp!

  date=i.split("\t")[0].to_s.split(" ")[0]
  # Skip blank lines / lines without a leading date field.
  next if date.nil?

  year=date.split("-")[0]
  if year_old=="-1" then
    puts "koodayo"
  elsif year_old != year
    # Year rollover: write the finished year and reset. Previously year_old
    # was overwritten with year in both branches before this comparison, so
    # the rollover could never trigger.
    count.keys.sort.each{|iiii|
      f_out[year_old].puts iiii.to_s+"\t"+count[iiii].to_s
    }
    count.keys.each{|iiii| count[iiii]=0}
    GC.start
  end
  year_old=year

  puts year+","+date
  i.split("NNEEWWLLIINNEE").each{|j|
    j.chomp!
    get_feature_from_text_continuous(j,m).each{|k|
      # Record only the very first sighting; testing count[k]==0 would
      # overwrite it after a yearly reset.
      first[k]=date if first[k]==0
      count[k]=count[k]+1
    }
  }

  iii=iii+1
  if iii%1000==0 then
    GC.start
    # The old code printed an undefined variable (year2) here.
    puts year+" "+iii.to_s
  end
}

count.keys.sort.each{|i|
  f_out[year].puts i.to_s+"\t"+count[i].to_s

  f_out2.puts i.to_s+"\t"+first[i].to_s
}

f2.close
f_out.each_value{|f| f.close}
f_out2.close
#count.keys.each{|iiii| count[i]=0}
#GC.start

||<

*1:morphology=="動詞" and morphology2=="連用形") or morphology=="名詞") then list_continuous_noun.push(keiyoushi) end list_continuous_noun.push(word) #list1.push(word) #list1.push(list_continuous_noun.join(""