mecabとrubyである単語の前後にある単語を取得

以下でなく連続ワード対応版を使う::
(1)node.feature.split(",")[6].encode("UTF-8","UTF-8") -> node.surface
featureだと未知語が出力されなくなる問題がある。
(2)複合語版を使うべき。
http://d.hatena.ne.jp/arupaka-_-arupaka/20150511/1431327544

mecabとrubyである単語の前後にある単語を取得

# -*- coding: Utf-8 -*-

require 'MeCab'


# Script-level setup: open the input file, prepare the empty accumulators,
# build the MeCab tagger and define a small sample sentence.
f = open("food_list0_b.txt")

food_list, id_list = [], []

m = MeCab::Tagger.new
list1 = []
text1 = "おはよう。うさぎさん, うさぎ"
# Tokenise +text1+ with +mecab+ and collect one string per node.
#
# The node returned by +parseToNode+ itself is skipped; every node reachable
# from it through +next+ contributes field 6 (0-based) of its comma-separated
# +feature+ string, re-tagged as UTF-8.
# NOTE(review): field 6 is presumably the base form under the IPA dictionary
# layout -- confirm against the dictionary in use.
#
# @param text1 [String] text to analyse
# @param mecab [#parseToNode] tagger-like object
# @return [Array<String>] extracted field per node, in node order
def get_feature_from_text(text1, mecab)
  collected = []
  current = mecab.parseToNode(text1)
  until (following = current.next).nil?
    collected << following.feature.split(",")[6].encode("UTF-8", "UTF-8")
    current = following
  end
  collected
end

# Smoke run: tokenise the sample sentence and print one token per line.
str = "うさぎ"
list1 = get_feature_from_text(text1, m)
puts list1

# Return the window of tokens centred on the first occurrence of +focus_word+.
#
# The placeholder tokens "*" and "NNEEWWLLIINNEE" are removed from +list1+
# IN PLACE before searching, so the caller's array is modified.
# The window spans up to +slide+ tokens on each side and includes the focus
# word itself; it is clipped at both ends of the list.
#
# @param list1 [Array<String>] token list (mutated)
# @param focus_word [String] token to look for
# @param slide [Integer] number of tokens to take on each side
# @return [Array<String>, nil] the window, or nil when the word is absent
def adjacent_word_from_list(list1, focus_word, slide = 1)
  target = focus_word.encode("UTF-8", "UTF-8")
  ["*", "NNEEWWLLIINNEE"].each { |placeholder| list1.delete(placeholder) }

  pos = list1.index(target)
  return nil if pos.nil?

  window_start = [0, pos - slide].max
  list1[window_start..(pos + slide)]
end


# Demo: tokenise the sample sentence, then look up a word that occurs in it
# and one that does not (adjacent_word_from_list returns nil for the latter).
text1 = "おはよう。うさぎさん, うさぎ"
list1 = get_feature_from_text(text1, m)

v = adjacent_word_from_list(list1, "うさぎ", 3)
print(v)

v = adjacent_word_from_list(list1, "ほんこん", 3)
print(v)

直接テキストから

  # Tokenise +text1+ with +mecab+ and return the window of tokens around
  # +focus_word+ (see adjacent_word_from_list for the window rules).
  #
  # @param text1 [String] raw text
  # @param focus_word [String] token to look for
  # @param slide [Integer] tokens to take on each side
  # @param mecab [#parseToNode] tagger passed to get_feature_from_text
  # @return [Array<String>, nil] whatever adjacent_word_from_list returns
  def adjacent_word_from_text(text1, focus_word, slide = 1, mecab)
    tokens = get_feature_from_text(text1, mecab)
    adjacent_word_from_list(tokens, focus_word, slide)
  end


リストの単語を網羅的に
前後100語slide=100

# -*- coding: Utf-8 -*-

require 'MeCab'


# Script-level setup: open the input file, prepare the empty accumulators
# filled further down, build the MeCab tagger and define a sample sentence.
f = open("food_list0_b.txt")

food_list, id_list = [], []

m = MeCab::Tagger.new
list1 = []
text1 = "おはよう。うさぎさん, うさぎ"
# Tokenise +text1+ with +mecab+ and collect one string per node.
#
# The node returned by +parseToNode+ itself is skipped; every node reachable
# from it through +next+ contributes field 6 (0-based) of its comma-separated
# +feature+ string, re-tagged as UTF-8.
# NOTE(review): field 6 is presumably the base form under the IPA dictionary
# layout -- confirm against the dictionary in use.
#
# @param text1 [String] text to analyse
# @param mecab [#parseToNode] tagger-like object
# @return [Array<String>] extracted field per node, in node order
def get_feature_from_text(text1, mecab)
  collected = []
  current = mecab.parseToNode(text1)
  until (following = current.next).nil?
    collected << following.feature.split(",")[6].encode("UTF-8", "UTF-8")
    current = following
  end
  collected
end

# Smoke run: tokenise the sample sentence and print one token per line.
str = "うさぎ"
list1 = get_feature_from_text(text1, m)
puts list1

# Return the window of tokens centred on the first occurrence of +focus_word+.
#
# The placeholder tokens "*" and "NNEEWWLLIINNEE" are removed from +list1+
# IN PLACE before searching, so the caller's array is modified.
# The window spans up to +slide+ tokens on each side and includes the focus
# word itself; it is clipped at both ends of the list.
#
# @param list1 [Array<String>] token list (mutated)
# @param focus_word [String] token to look for
# @param slide [Integer] number of tokens to take on each side
# @return [Array<String>, nil] the window, or nil when the word is absent
def adjacent_word_from_list(list1, focus_word, slide = 1)
  target = focus_word.encode("UTF-8", "UTF-8")
  ["*", "NNEEWWLLIINNEE"].each { |placeholder| list1.delete(placeholder) }

  pos = list1.index(target)
  return nil if pos.nil?

  window_start = [0, pos - slide].max
  list1[window_start..(pos + slide)]
end


# Return the tokens before and after the first occurrence of +focus_word+
# as two separate lists; the focus word itself is in neither list.
#
# The placeholder tokens "*" and "NNEEWWLLIINNEE" are removed from +list1+
# IN PLACE before searching, so the caller's array is modified.
#
# When the focus word is the first (or last) token the corresponding side is
# an empty array. Earlier revisions clamped the range bounds onto the focus
# word's own index in those cases and leaked the focus word into the context.
#
# @param list1 [Array<String>] token list (mutated)
# @param focus_word [String] token to look for
# @param slide [Integer] maximum number of tokens to take on each side
# @return [Array] [before, after]; [nil, nil] when the word is absent
def adjacent_word_from_list2(list1, focus_word, slide = 1)
  focus_word = focus_word.encode("UTF-8", "UTF-8")
  list1.delete("*")
  list1.delete("NNEEWWLLIINNEE")

  i = list1.index(focus_word)
  return [nil, nil] if i.nil?

  # Exclusive upper bound keeps the focus word out of the "before" side.
  before = list1[[0, i - slide].max...i] || []
  # Slicing from i + 1 yields [] when the focus word is the last token;
  # the end of the range is clipped to the list length by Array#[].
  after = list1[(i + 1)..(i + slide)] || []

  [before, after]
end


# Demo, first pass: tokenise the sample sentence and look up a word
# (the result is not printed).
text1 = "おはよう。うさぎさん, うさぎ"
list1 = get_feature_from_text(text1, m)
v = adjacent_word_from_list(list1, "うさぎ", 3)

# Demo, second pass: same lookup, this time printing the result, followed by
# a word that is absent (adjacent_word_from_list returns nil for it).
text1 = "おはよう。うさぎさん, うさぎ"
list1 = get_feature_from_text(text1, m)
v = adjacent_word_from_list(list1, "うさぎ", 3)
print(v)

v = adjacent_word_from_list(list1, "ほんこん", 3)
print(v)

# Tokenise +text1+ with +mecab+ and return the window of tokens around
# +focus_word+ (focus word included), as produced by adjacent_word_from_list.
#
# @param text1 [String] raw text, e.g. "おはよう。うさぎさん, うさぎ"
# @param focus_word [String] token to look for
# @param slide [Integer] tokens to take on each side
# @param mecab [#parseToNode] tagger passed to get_feature_from_text
# @return [Array<String>, nil] whatever adjacent_word_from_list returns
def adjacent_word_from_text(text1, focus_word, slide = 1, mecab)
  tokens = get_feature_from_text(text1, mecab)
  adjacent_word_from_list(tokens, focus_word, slide)
end

# Tokenise +text1+ with +mecab+ and return the tokens before and after
# +focus_word+ as a two-element array, as produced by adjacent_word_from_list2.
#
# @param text1 [String] raw text, e.g. "おはよう。うさぎさん, うさぎ"
# @param focus_word [String] token to look for
# @param slide [Integer] maximum tokens to take on each side
# @param mecab [#parseToNode] tagger passed to get_feature_from_text
# @return [Array] whatever adjacent_word_from_list2 returns
def adjacent_word_from_text2(text1, focus_word, slide = 1, mecab)
  tokens = get_feature_from_text(text1, mecab)
  adjacent_word_from_list2(tokens, focus_word, slide)
end
#exit
#exit


# Read the comma-separated list file line by line: the first field goes to
# id_list and the second to food_list, keeping the two arrays index-aligned.
f.each do |row|
  row.chomp!
  fields = row.split(",")
  id_list << fields[0]
  food_list << fields[1]
end



#food_list.each{|i| puts i}


#f2=open('buumu_kanrenbun3_str.csv')

# Scan every line of the input CSV for each food name and write one output
# row per (line, food) hit with the surrounding tokens.
f2 = open('buumu_tot_kanrenbun3_str.csv')
k = 0
count = Hash.new(0)
# Previous output target: open("buumu_count.csv","w")

f_out = open("buumu_count_word.csv", "w")
slide = 100 # tokens kept on each side of the food name

f2.each do |line|
  line.chomp!

  food_list.each_with_index do |food, idx|
    # Cheap substring pre-filter; the token-level lookup below may still
    # fail to find the word, in which case both sides come back as nil.
    next unless line.include?(food)

    before, after = adjacent_word_from_text2(line, food, slide, m)
    before = [""] if before.nil?
    after = [""] if after.nil?

    # Row layout: id, food, first whitespace-delimited token of the first
    # tab-separated field, tokens before (":"-joined), tokens after.
    f_out.puts id_list[idx] + "," + food + "," +
               line.split("\t")[0].split(" ")[0] + "," +
               before.join(":") + "," + after.join(":")
  end

  k += 1
  # Progress trace: echo the first tab-separated field of the line.
  puts line.split("\t")[0]
end