mecabとrubyで形態素解析して文章を単語に分割してリストにつめる[基本]
# -*- coding: Utf-8 -*-
require 'MeCab'

# Runs MeCab over +text1+ and collects one string per morpheme node.
#
# For every node it takes index 6 of the comma-separated feature string
# (presumably the base/dictionary form with an IPADIC-style dictionary --
# TODO confirm against the dictionary in use).
#
# The node returned by parseToNode itself is never collected: the loop
# advances to node.next before reading. The final node of the chain (the
# one whose next is nil) IS collected.
#
# @param text1 [String] text to analyse
# @param mecab [MeCab::Tagger] tagger used for parsing
# @return [Array<String>] feature field 6 of each visited node
def get_feature_from_text(text1, mecab)
  list1 = []
  node = mecab.parseToNode(text1)
  while node.next != nil do
    node = node.next
    # encode with identical source and destination encodings; apparently
    # meant to make sure the collected string is tagged as UTF-8.
    # (The original called the non-existent String#eccode here.)
    list1.push(node.feature.split(",")[6].encode("UTF-8", "UTF-8"))
  end
  list1
end

m = MeCab::Tagger.new()
text1 = "うさぎさん。こんにちは。はい、うさぎです。"
list1 = get_feature_from_text(text1, m)
puts list1
リストの単語を網羅的に調べる。
# -*- coding: Utf-8 -*-
require 'MeCab'

# Tokens that are dropped before neighbours are looked up.
# "*" is presumably what MeCab reports when feature field 6 is unset, and
# "NNEEWWLLIINNEE" is presumably a newline placeholder used in the input
# data -- TODO confirm both against the data source.
IGNORED_TOKENS = ["*", "NNEEWWLLIINNEE"].freeze

# Runs MeCab over +text1+ and collects one string per morpheme node.
#
# For every node it takes index 6 of the comma-separated feature string
# (presumably the base form with an IPADIC-style dictionary -- TODO confirm).
# The node returned by parseToNode itself is skipped because the loop
# advances before reading; the final node of the chain is included.
#
# @param text1 [String] text to analyse
# @param mecab [MeCab::Tagger] tagger used for parsing
# @return [Array<String>] feature field 6 of each visited node
def get_feature_from_text(text1, mecab)
  list1 = []
  node = mecab.parseToNode(text1)
  while node.next != nil do
    node = node.next
    list1.push(node.feature.split(",")[6].encode("UTF-8", "UTF-8"))
  end
  list1
end

# Returns the window of tokens around the first occurrence of +focus_word+,
# including the focus word itself.
#
# Tokens listed in IGNORED_TOKENS are filtered out first. Unlike the
# original implementation, +list1+ is not modified.
#
# @param list1 [Array<String>] token list
# @param focus_word [String] token to search for (exact match)
# @param slide [Integer] number of tokens to keep on each side
# @return [Array<String>, nil] the window, or nil when the word is absent
def adjacent_word_from_list(list1, focus_word, slide = 1)
  focus_word = focus_word.encode("UTF-8", "UTF-8")
  words = list1.reject { |word| IGNORED_TOKENS.include?(word) }
  i = words.index(focus_word)
  return nil if i.nil?

  first = [0, i - slide].max
  # A range end past the last index is clamped by Array#[].
  words[first..(i + slide)]
end

# Returns the tokens before and after the first occurrence of +focus_word+
# as two separate lists, neither of which contains the focus word.
#
# Tokens listed in IGNORED_TOKENS are filtered out first; +list1+ is not
# modified. When the focus word is the first (or last) token, the
# corresponding side is an empty array. The original returned the focus
# word itself in that case because its index arithmetic was clamped onto
# the focus position.
#
# @param list1 [Array<String>] token list
# @param focus_word [String] token to search for (exact match)
# @param slide [Integer] maximum number of tokens to keep on each side
# @return [Array] [before, after]; [nil, nil] when the word is absent
def adjacent_word_from_list2(list1, focus_word, slide = 1)
  focus_word = focus_word.encode("UTF-8", "UTF-8")
  words = list1.reject { |word| IGNORED_TOKENS.include?(word) }
  i = words.index(focus_word)
  return [nil, nil] if i.nil?

  first = [0, i - slide].max
  before = words[first...i]
  # Starting exactly at words.length yields [] rather than nil.
  after = words[(i + 1)..(i + slide)]
  [before, after]
end

# Tokenizes +text1+ and returns the window around +focus_word+
# (see adjacent_word_from_list).
#
# @param text1 [String] text to analyse
# @param focus_word [String] token to search for
# @param slide [Integer] number of tokens to keep on each side
# @param mecab [MeCab::Tagger] tagger used for parsing
# @return [Array<String>, nil]
def adjacent_word_from_text(text1, focus_word, slide = 1, mecab)
  list1 = get_feature_from_text(text1, mecab)
  adjacent_word_from_list(list1, focus_word, slide)
end

# Tokenizes +text1+ and returns the tokens before and after +focus_word+
# (see adjacent_word_from_list2).
#
# @param text1 [String] text to analyse
# @param focus_word [String] token to search for
# @param slide [Integer] maximum number of tokens to keep on each side
# @param mecab [MeCab::Tagger] tagger used for parsing
# @return [Array] [before, after]; [nil, nil] when the word is absent
def adjacent_word_from_text2(text1, focus_word, slide = 1, mecab)
  list1 = get_feature_from_text(text1, mecab)
  adjacent_word_from_list2(list1, focus_word, slide)
end

# --- Load the food list: one "id,name" pair per line ------------------------
# Read up front with File.foreach so the handle is closed again. Lines
# without both an id and a non-empty name are skipped; a nil name would
# make String#include? raise later, and an empty one would match every line.
foods = []
File.foreach("food_list0_b.txt") do |row|
  id, name = row.chomp.split(",")
  next if id.nil? || name.nil? || name.empty?

  foods.push([id, name])
end

m = MeCab::Tagger.new()

# --- Small demonstration on a fixed sentence --------------------------------
text1 = "おはよう。うさぎさん, うさぎ"
list1 = get_feature_from_text(text1, m)
puts list1

v = adjacent_word_from_list(list1, "うさぎ", 3)
print v
v = adjacent_word_from_list(list1, "ほんこん", 3)
print v

# --- Scan the corpus and write the neighbours of every food mention ---------
# Alternative input used previously: 'buumu_kanrenbun3_str.csv'
# (with output "buumu_count.csv").
slide = 100
File.open('buumu_tot_kanrenbun3_str.csv') do |f2|
  File.open("buumu_count_word.csv", "w") do |f_out|
    f2.each_line do |line|
      line.chomp!
      label = line.split("\t")[0]
      # First space-separated word of the first tab-separated column;
      # to_s keeps an empty or tab-leading line from raising.
      first_word = label.to_s.split(" ")[0]
      tokens = nil

      foods.each do |id, name|
        next unless line.include?(name)

        # Parse each line at most once, and only if some food matched.
        tokens ||= get_feature_from_text(line, m)
        before, after = adjacent_word_from_list2(tokens, name, slide)
        before ||= []
        after ||= []
        # Output row: id,name,first_word,before tokens (":"-joined),after tokens.
        f_out.puts [id, name, first_word, before.join(":"), after.join(":")].join(",")
      end

      puts label
    end
  end
end