以下でなく連続ワード対応版を使う::
(1)node.feature.split(",")[6].encode("UTF-8","UTF-8") -> node.surface
featureだと未知語がでなくなる問題がある。
(2)複合語版を使うべき。
http://d.hatena.ne.jp/arupaka-_-arupaka/20150511/1431327544
# -*- coding: Utf-8 -*-
require 'MeCab'

m = MeCab::Tagger.new()

# Tokenize +text1+ with the MeCab tagger +mecab+ and return an Array of
# UTF-8 strings, one per token.
#
# Each token is represented by the seventh comma-separated field of
# node.feature (the base form in the IPADIC layout -- verify for other
# dictionaries). When that field is missing or is the "*" placeholder,
# which is what unknown words get, the token's surface string is used
# instead so that unknown words are not silently lost.
#
# The first and last nodes of the chain (sentence boundary sentinels)
# are not included in the result.
def get_feature_from_text(text1, mecab)
  tokens = []
  node = mecab.parseToNode(text1)
  node = node.next unless node.nil? # step past the leading sentinel node
  # The last node of the chain is the trailing sentinel; stop before it.
  while !node.nil? && !node.next.nil?
    base = node.feature.split(",")[6]
    word = (base.nil? || base == "*") ? node.surface : base
    tokens.push(word.encode("UTF-8", "UTF-8"))
    node = node.next
  end
  tokens
end

# Return the window of tokens around the first occurrence of +focus_word+
# in +list1+: up to +slide+ tokens before it, the word itself, and up to
# +slide+ tokens after it. Returns nil when +focus_word+ is not present.
#
# "*" and "NNEEWWLLIINNEE" entries are ignored (the latter is presumably a
# newline placeholder inserted upstream -- confirm against the data).
# +list1+ itself is left untouched.
def adjacent_word_from_list(list1, focus_word, slide=1)
  focus_word = focus_word.encode("UTF-8", "UTF-8")
  # Filter into a new array instead of deleting from the caller's list.
  words = list1.reject { |w| w == "*" || w == "NNEEWWLLIINNEE" }
  i = words.index(focus_word)
  return nil if i.nil?

  first = [0, i - slide].max
  last = [i + slide, words.length - 1].min
  words[first..last]
end

# Demo run.
text1 = "おはよう。うさぎさん, うさぎ"
list1 = get_feature_from_text(text1, m)
puts list1

v = adjacent_word_from_list(list1, "うさぎ", 3)
print v
v = adjacent_word_from_list(list1, "ほんこん", 3)
print v
直接テキストから
# Tokenize +text1+ with +mecab+ (via get_feature_from_text) and return
# whatever adjacent_word_from_list yields for +focus_word+ and +slide+.
#
# +slide+ is an optional parameter in the middle of the list: when only
# three arguments are given they bind to text1, focus_word and mecab.
def adjacent_word_from_text(text1, focus_word, slide=1, mecab)
  tokens = get_feature_from_text(text1, mecab)
  adjacent_word_from_list(tokens, focus_word, slide)
end
リストの単語を網羅的に
前後100語を取得する(slide=100)
# -*- coding: Utf-8 -*-
require 'MeCab'

m = MeCab::Tagger.new()

# Tokens that are dropped before searching for the focus word.
# "NNEEWWLLIINNEE" is presumably a newline placeholder inserted upstream
# -- confirm against the input data.
IGNORED_TOKENS = ["*", "NNEEWWLLIINNEE"]

# Tokenize +text1+ with the MeCab tagger +mecab+ and return an Array of
# UTF-8 strings, one per token.
#
# Each token is represented by the seventh comma-separated field of
# node.feature (the base form in the IPADIC layout -- verify for other
# dictionaries). When that field is missing or is the "*" placeholder,
# which is what unknown words get, the token's surface string is used
# instead so that unknown words are not silently lost.
#
# The first and last nodes of the chain (sentence boundary sentinels)
# are not included in the result.
def get_feature_from_text(text1, mecab)
  tokens = []
  node = mecab.parseToNode(text1)
  node = node.next unless node.nil? # step past the leading sentinel node
  # The last node of the chain is the trailing sentinel; stop before it.
  while !node.nil? && !node.next.nil?
    base = node.feature.split(",")[6]
    word = (base.nil? || base == "*") ? node.surface : base
    tokens.push(word.encode("UTF-8", "UTF-8"))
    node = node.next
  end
  tokens
end

# Return the window of tokens around the first occurrence of +focus_word+
# in +list1+: up to +slide+ tokens before it, the word itself, and up to
# +slide+ tokens after it. Returns nil when +focus_word+ is not present.
# +list1+ itself is left untouched.
def adjacent_word_from_list(list1, focus_word, slide=1)
  focus_word = focus_word.encode("UTF-8", "UTF-8")
  # Filter into a new array instead of deleting from the caller's list.
  words = list1.reject { |w| IGNORED_TOKENS.include?(w) }
  i = words.index(focus_word)
  return nil if i.nil?

  first = [0, i - slide].max
  last = [i + slide, words.length - 1].min
  words[first..last]
end

# Like adjacent_word_from_list, but returns the context split in two:
# [tokens_before, tokens_after], each holding at most +slide+ tokens and
# neither containing the focus word itself. Returns [nil, nil] when
# +focus_word+ is not present. +list1+ itself is left untouched.
def adjacent_word_from_list2(list1, focus_word, slide=1)
  focus_word = focus_word.encode("UTF-8", "UTF-8")
  words = list1.reject { |w| IGNORED_TOKENS.include?(w) }
  i = words.index(focus_word)
  return [nil, nil] if i.nil?

  first = [0, i - slide].max
  # Exclusive upper bound: when the focus word is the first token the
  # "before" side is empty rather than the focus word itself.
  before = words[first...i] || []
  # Ranges running past the end are clipped by Array#[]; when the focus
  # word is the last token the "after" side is empty.
  after = words[(i + 1)..(i + slide)] || []
  [before, after]
end

# Demo run.
text1 = "おはよう。うさぎさん, うさぎ"
list1 = get_feature_from_text(text1, m)
v = adjacent_word_from_list(list1, "うさぎ", 3)
print v
v = adjacent_word_from_list(list1, "ほんこん", 3)
print v

# Tokenize +text1+ with +mecab+ and return the combined window around
# +focus_word+ (see adjacent_word_from_list).
def adjacent_word_from_text(text1, focus_word, slide=1, mecab)
  list1 = get_feature_from_text(text1, mecab)
  adjacent_word_from_list(list1, focus_word, slide)
end

# Tokenize +text1+ with +mecab+ and return the [before, after] windows
# around +focus_word+ (see adjacent_word_from_list2).
def adjacent_word_from_text2(text1, focus_word, slide=1, mecab)
  list1 = get_feature_from_text(text1, mecab)
  adjacent_word_from_list2(list1, focus_word, slide)
end

# Load the "id,name" pairs. Lines without a usable name are skipped:
# a nil name would make String#include? raise, and an empty name would
# match every input line.
id_list = []
food_list = []
File.foreach("food_list0_b.txt") do |row|
  id, food = row.chomp.split(",")
  next if id.nil? || food.nil? || food.empty?
  id_list.push(id)
  food_list.push(food)
end

slide = 100

# Block forms close both files even if processing raises. The input is
# opened first so a missing input does not truncate an existing output.
File.open('buumu_tot_kanrenbun3_str.csv') do |f2|
  File.open("buumu_count_word.csv", "w") do |f_out|
    f2.each do |line|
      line.chomp!
      head = line.split("\t")[0]
      tokens = nil # tokenized lazily, at most once per input line

      food_list.each_with_index do |food, j|
        next unless line.include?(food)

        tokens ||= get_feature_from_text(line, m)
        before, after = adjacent_word_from_list2(tokens, food, slide)
        # The substring matched, but the tokenizer may not have produced
        # the name as a single token; emit empty context columns then.
        before ||= []
        after ||= []

        f_out.puts([
          id_list[j],
          food,
          head.to_s.split(" ")[0],
          before.join(":"),
          after.join(":")
        ].join(","))
      end

      puts head # progress output
    end
  end
end