Silphireの日記: search.plからタイトル部分を抜き出す
日記 by
Silphire
# Regular expressions used by getindex.
#
# START_COMMENT matches everything up to and including the opening template
# marker; END_COMMENT matches the closing marker and everything after it.
# The /m flag is required so that `.` also matches newlines: without it only
# the text on the marker's own line was matched, and the rest of the page
# before/after the result list was left in the string.
START_COMMENT = /.*<!-- start template: ID 45, storysearch;search;default -->/m
END_COMMENT = /<!-- end template: ID 45, storysearch;search;default -->.*/m
# One result row. Captures, in order: the leading number, the HREF value up
# to (not including) its first "&", the link text, the author, the comment
# count, and the text following "on".
TITLE_PATTERN = /(\d+) <A HREF="(.*?)&.*?">(.*?)<\/A> by (.*?) with (\d+) comments <FONT SIZE="2">on (.*?)<\/FONT><BR>/
# getindex
# Retrieves '/search.pl' on 'slashdot.jp' through gethtml and extracts the
# title, URL and related fields of every row that matches TITLE_PATTERN.
#
# gethtml is defined outside this snippet; it is assumed to return the page
# body as a mutable String (the string is edited in place below).
#
# Returns an Array of Hashes with the String keys 'number' (Integer), 'url',
# 'title', 'author', 'comment' (Integer) and 'time'.
def getindex
  # Fetch the HTML.
  html = gethtml('slashdot.jp', '/search.pl')

  # Remove whatever the start/end marker patterns match, in that order.
  [START_COMMENT, END_COMMENT].each { |marker| html.gsub!(marker, '') }

  # Turn each matched row into a hash of its captured fields.
  html.scan(TITLE_PATTERN).map do |number, url, title, author, comment, time|
    {
      'number' => number.to_i,
      'url' => url,
      'title' => title,
      'author' => author,
      'comment' => comment.to_i,
      'time' => time
    }
  end
end
# Print one line per entry returned by getindex. Only the title is passed
# through String#tosjis (presumably supplied by kconv, which is not required
# anywhere in the visible code -- confirm it is loaded elsewhere).
getindex.each do |entry|
  number, title, url, author, comment, time =
    entry.values_at('number', 'title', 'url', 'author', 'comment', 'time')
  print "#{number} #{title.tosjis} <#{url}> by #{author} with #{comment} comments on #{time}\n"
end
# Regular expressions used by getindex.
#
# START_COMMENT matches everything up to and including the opening template
# marker; END_COMMENT matches the closing marker and everything after it.
# The /m flag is required so that `.` also matches newlines: without it only
# the text on the marker's own line was matched, and the rest of the page
# before/after the result list was left in the string.
START_COMMENT = /.*<!-- start template: ID 45, storysearch;search;default -->/m
END_COMMENT = /<!-- end template: ID 45, storysearch;search;default -->.*/m
# One result row. Captures, in order: the leading number, the HREF value up
# to (not including) its first "&", the link text, the author, the comment
# count, and the text following "on".
TITLE_PATTERN = /(\d+) <A HREF="(.*?)&.*?">(.*?)<\/A> by (.*?) with (\d+) comments <FONT SIZE="2">on (.*?)<\/FONT><BR>/
# getindex
# Retrieves '/search.pl' on 'slashdot.jp' through gethtml and extracts the
# title, URL and related fields of every row that matches TITLE_PATTERN.
#
# gethtml is defined outside this snippet; it is assumed to return the page
# body as a mutable String (the string is edited in place below).
#
# Returns an Array of Hashes with the String keys 'number' (Integer), 'url',
# 'title', 'author', 'comment' (Integer) and 'time'.
def getindex
  # Fetch the HTML.
  html = gethtml('slashdot.jp', '/search.pl')

  # Remove whatever the start/end marker patterns match, in that order.
  [START_COMMENT, END_COMMENT].each { |marker| html.gsub!(marker, '') }

  # Turn each matched row into a hash of its captured fields.
  html.scan(TITLE_PATTERN).map do |number, url, title, author, comment, time|
    {
      'number' => number.to_i,
      'url' => url,
      'title' => title,
      'author' => author,
      'comment' => comment.to_i,
      'time' => time
    }
  end
end
# Print one line per entry returned by getindex. Only the title is passed
# through String#tosjis (presumably supplied by kconv, which is not required
# anywhere in the visible code -- confirm it is loaded elsewhere).
getindex.each do |entry|
  number, title, url, author, comment, time =
    entry.values_at('number', 'title', 'url', 'author', 'comment', 'time')
  print "#{number} #{title.tosjis} <#{url}> by #{author} with #{comment} comments on #{time}\n"
end
search.plからタイトル部分を抜き出す More ログイン