- クロール
get.sh
# get.sh -- fetch the ja.wikipedia "Category:<year>年没" page for each year
# from 1 to 2013 with wget (which saves each page in the current directory).
# Run with: sh get.sh   (POSIX sh only, no bashisms)

year=1
while [ "$year" -le 2013 ]; do
  echo "$year"   # progress: the year about to be fetched
  # %E5%B9%B4%E6%B2%A1 is the percent-encoded UTF-8 form of "年没".
  wget "http://ja.wikipedia.org/wiki/Category:${year}%E5%B9%B4%E6%B2%A1"
  sleep 20       # wait 20 seconds between requests
  year=$((year + 1))
done
sh get.sh
- データ整理
out.sh
# out.sh -- build hito.txt ("year,name" lines) from saved category pages.
#
# Input:  every regular file in the current directory whose name contains
#         "Cate" (presumably the pages downloaded by get.sh -- confirm).
# Output: hito.txt, sorted numerically by year, without lines containing
#         "訃報". Every "year,name" line is also echoed to stdout as it is
#         produced.
# Run with: sh out.sh   (POSIX sh only, so no `local`; variables are global)
#
# `set -e` is deliberately not used: grep exits 1 when nothing matches, which
# is a normal outcome here.

# Read one saved page on stdin and print one extracted link text per line.
extract_names() {
  grep 'title=' \
    | grep li \
    | grep ul \
    | grep wiki \
    | grep -v catlinks \
    | awk -F'</a>' '{ print $1 }' \
    | sed -e 's/.*">//' -e 's/(.*).*//'
  # awk keeps the text before the first </a>; sed then drops everything up to
  # the last `">` and cuts the line at the first parenthesised part.
}

# Print the year encoded in file name $1: strip everything up to the last
# "3A" (an escaped ":" in the saved name) and everything from "年没" on.
year_of() {
  printf '%s\n' "$1" | sed 's/.*3A//;s/年没.*//'
}

# Process every matching page and write hito.txt.
main() {
  rows=$(mktemp) || { echo 'out.sh: mktemp failed' >&2; return 1; }
  trap 'rm -f -- "$rows"' EXIT

  for page in *Cate*; do
    [ -f "$page" ] || continue   # unmatched glob stays literal; skip it
    year=$(year_of "$page")
    # Prefix each name with its year; tee echoes the rows and collects them.
    extract_names < "$page" \
      | awk -v year="$year" '{ print year "," $0 }' \
      | tee -a "$rows"
  done

  # -t, makes the year the explicit sort key instead of relying on numeric
  # parsing stopping at the comma.
  sort -t, -k1,1g "$rows" | grep -v 訃報 > hito.txt
  rm -f -- "$rows"
}

main
sh out.sh