dseg (14237) の日記
2003 年 12 月 24 日
午後 05:44
/.-J→hatena 日記コピースクリプト v0.0.0.9
# --------------------------------------------------------------------
# Title: slash2hatena.pl
# Author: dseg <ds26 at mail.goo.ne.jp>
# URL: http://srad.jp/journal.pl?op=display&uid=14237&id=177265
# Summary:...later...
# Commentary: Definitely a poor hack... Blame me ;)
#
# $Id: slash2hatena.pl,v 0.0.0.9 2004-01-13 07:58:24+09 dseg Exp dseg $
# --------------------------------------------------------------------
=head1 NAME
slash2hatena - copy all journal entries from slashdot.jp to d.hatena.ne.jp
=head1 SYNOPSIS
Usage: slash2hatena.pl [options] <URL>
Required options are:
[-h (hatena) USER:PASS ] your username and password of Hatena
[-s (slashdot) USER or ID] your username or id number of Slashdot-J
Other options are:
[-t <timeout>] set the timeout value(DEFAULT: 30sec)
[-p <proxy-url>] set the proxy-url to be used
[-v] print version number and quit
[--help] print this help message
=head1 AUTHOR
dseg <ds26@mail.goo.ne.jp>
=cut
require 5;
use strict;
# require 'jcode.pl'; # http://srekcah.org/jcode/jcode.pl-2.13
# use Encode qw/from_to/;
use File::Basename qw/fileparse/;
use Getopt::Std;
use HTTP::Request::Common qw/POST/;
use LWP::UserAgent;
use Pod::Usage;
use Time::Local;
# --- Global Variables -----------------------------------------------
use vars qw/$VERSION %opts $h_uid $h_passwd @s_entries/;
# --------------------------------------------------------------------
# --- Init -----------------------------------------------------------
BEGIN {$SIG{INT} = sub{ die "Interrupted!\n" }; $| = 1} # autoflush
($VERSION) = q$Revision: 0.0.0.9 $ =~ /((?:\d\.)+\d+)/;
my $euc2sjis = \&from_to;
# --- Parse command-line ---------------------------------------------
pod2usage(2) if @ARGV == 0 || $ARGV[0] eq '--help';
getopt('h:s:p:t:v', \%opts);
die File::Basename::basename($0), " v${VERSION}\n" if exists $opts{v};
($h_uid, $h_passwd) = split /:/, $opts{h}, 2;
pod2usage(2) unless $h_uid && $h_passwd && $opts{s};
# --- Firstly, login to hatena ---------------------------------------
my $ua = new LWP::UserAgent(
timeout => 30,
agent => &myagent,
cookie_jar => {
# file => 'hatena_cookie.txt',
# autosave => 1,
hide_cookie2 => 1,
},
);
$ua->proxy('http', $opts{p})
if $opts{p} && index($opts{p}, 'http://') == 0;
# --- Determine the user nick of /.-J if id number is present
my ($s_nick, $s_id);
if($opts{s} =~ /^\d+$/) { # id
$s_id = $opts{s};
my $url = "http://srad.jp/users.pl?uid=${s_id}&light=1";
$_ = $ua->get($url);
die "Error: Unable to fetch the page from /.-J\n[url=${url}]\n"
unless $_->is_success;
($s_nick) = $_->content =~ /^($s_id)\s*\((?:\d+)\)/mo;
die "Error: no such userid on /.-J -- [id=$s_id]\n"
unless $s_nick;
} else { # nick
$s_nick = $opts{s};
my $url = "http://srad.jp/users.pl?nick=${s_nick}&light=1";
$_ = $ua->get($url);
die "Error: Unable to fetch the page from /.-J\n[url=${url}]\n"
unless $_->is_success;
($s_id) = $_->content =~ /^(?:$s_nick)\s*\((\d+)\)/mo;
die "Error: no such user on /.-J -- [nick=$s_nick]\n"
unless $s_id;
}
# --- Login succeeded? exit when failed
die "Failed to login to d.hatena.ne.jp\n"
unless hatena_login($ua, $h_uid, $h_passwd);
# --- Read the contents of journals of /.-J --------------------------
my $dir = (fileparse($0))[1]; # Incl. trailing 'path char'
my $datafile = $dir . 'slash_journal.xml';
my $base_url = "http://srad.jp/journal.pl?op=display&uid=${s_id}&id=";
# Read the journal entries from xml file or network, and
# store to the global array - @s_entries.
if(-e $datafile && -r _) {
read_journals_from_xml($datafile);
} else {
my $s_ua = new LWP::UserAgent(
timeout => 30,
agent => &myagent,
);
$s_ua->proxy('http', $opts{p})
if $opts{p} && index($opts{p}, 'http://') == 0;
parse_slash_journal_list($s_ua, $base_url);
my $total_entries = scalar @s_entries;
print "\n--- total number of journal entries: $total_entries\n";
my ($elems, $journal_html, $res, $i);
my $url_opt = '&theme=greypage&light=1';
# modify global array @s_entries in here...
for my $item(@s_entries) {
printf "processing... %d / %d\n", ++$i, $total_entries;
next unless $item;
$res = $ua->get($base_url . $item . $url_opt); # item == each journal id
die "$!\n" unless $res->is_success;
$_ = $res->content;
$elems = &parse_slash_journal
or die "Error occured when trying to fetch the journal entry from /.-J\n";
$item = $elems; # overwrite
sleep 1;
}
}
# serialize to handy .csv file (encoding: ShiftJIS)
# if(@s_entries > 0) {
# my $csv = sprintf '%sjournal-%s_%s.csv', $dir, &now =~ /^(\d{8})(\d{6})$/;
# open(F, "> $csv") or die "$!";
# write_to_csv(\*F, \@s_entries);
# print "\n/.-J journals saved to $csv (charset: ShiftJIS)\n\n"
# if -e $csv;
# }
post_to_hatena($base_url);
exit 0; # ..FIX THIS.. RETURN THE PROPER(SUCCESSED/FAILED) VALUE
# [-- Subroutines ---------------------------------------------------]
sub myagent {
sprintf '%s/%s ', (fileparse($0, qr{\..*}))[0], $main::VERSION;
}
sub now {
sprintf '%04d%02d%02d%02d%02d%02d',
sub{($_[5]+1900, $_[4]+1, $_[3]), $_[2], $_[1], $_[0]}->(localtime);
}
# --- Get journal entries from /.-J ---
sub parse_slash_journal_list {
my ($s_ua, $base_url) = @_;
# Get the list of journals
my $res = $s_ua->get(
"http://srad.jp/journal.pl?op=list&uid=${s_id}&light=1"
);
die $res->status_line . "\n" unless $res->is_success;
@s_entries = map /(\d+)$/,
grep { index($_, 'journal.pl') > 0 }
$res->content =~ /<[Aa] HREF="([^\"]+)"/g;
}
sub parse_slash_journal{
my $elems = {
id => undef,
title => undef,
body => undef,
year => undef,
mon => undef,
mday => undef,
hour => undef,
min => undef,
ampm => undef,
epoc => undef,
comment_cnt => 0,
comment_url => '',
};
my($top, $time_pattern, $end_pattern);
my $url_pattern = "/journal\\.pl\\?op=display&uid=${s_id}&id=(\\d+)";
my $date_pattern = '<[^>]+>(\d\d\d\d)\s?年\s?(\d\d)\s?月\s?(\d\d)\s?日</[^>]+>';
@$elems{qw/comment_cnt comment_url/} = ($1, $2)
if m!(\d+)個の<[Aa] HREF="([^\"]+)">コメント\.\.\.</[Aa]>!;
if(($top = index($_, '<!-- start template: ID 51, greypage;journal;default -->')) >= 0) {
$_ = substr($_, $top, rindex($_, '<!-- end template: ID 51, greypage;journal;default -->'));
$time_pattern = '<[Ii]>([AP]M)\s(\d\d):(\d\d)</[Ii]>';
$end_pattern = '</[pP]>$';
@$elems{qw/year mon mday ampm hour min id title body/} = m{
$date_pattern.*
$time_pattern.*?<a\s+href="$url_pattern">(.*?)</[aA]></FONT></B>\n?
\t*<P>\n?
\t*(.*)
\Z
}sox;
}
# Remove trailing garbage
$elems->{body} =~ s/$end_pattern.*$//mos;
my $hour24 = $elems->{hour};
$hour24 -= 12 if $elems->{ampm} eq 'AM' && $elems->{hour} == 12; # AM 12
$hour24 += 12 if $elems->{ampm} eq 'PM' && $elems->{hour} < 12; # PM 1 to PM 11
$elems->{epoc} = timelocal(
0, $elems->{min}, $hour24, $elems->{mday}, $elems->{mon}-1, $elems->{year}-1900
);
return $elems;
}
# --- Write to a file ------------------------------------------------
sub write_to_csv {
my ($fp, $all) = @_;
my $crlf = "\x0D\x0A";
my ($dt, $lines);
print $fp 'date,title,body,comment,text', $crlf;
for my $e(@$all) {
$dt = join('/', @$e{qw/year mon mday/});
$dt .= ' ' . join(':', @$e{qw/hour min/}) . ' ' . $e->{ampm};
$e->{body} =~ s/\"/\"\"/g;
$lines = join($crlf, "$dt,$e->{title},\"", $e->{body}, '",,') . $crlf;
# print $fp from_to($lines, 'euc-jp', 'shiftjis');
}
}
sub write_to_xml {
my ($fp, $all) = @_;
print $fp <<"XML";
<?xml version="1.0" encoding=""?>
<journal baseurl="http://srad.jp/journal.pl?op=display&uid=${s_id}">
<entries>
XML
print F "</entries>\n</journal>\n";
close F || warn "$!\n";
}
# --- Auto-post journal entries --------------------------------------
sub hatena_login {
my ($ua, $username, $password) = @_;
my $req = POST 'http://d.hatena.ne.jp/login',
[ key => $username,
password => $password,
submit => 'ログイン',
];
my $res = $ua->request($req);
if($res->is_redirect) {
$ua->cookie_jar->extract_cookies($res);
$ua->cookie_jar->save('hatena_cookie.txt');
# $url = $res->header('Location');
} else {
warn "Warning: $res->status_line\n";
return 0;
}
1;
}
sub post_to_hatena {
my $base_url = shift;
my $h_post_url = "http://d.hatena.ne.jp/${h_uid}/edit";
my ($req2, $res);
my $sleep_cnt = 0;
use constant MAX_RETRY => 3;
for my $entry(@s_entries) {
$req2 = POST $h_post_url,
[ mode => 'enter',
timestamp => '',
year => $entry->{year},
month => $entry->{mon},
day => $entry->{mday},
# date => '20031217',
title => $entry->{title},
body => generate_hatena_body($entry),
edit => 'この内容を登録する',
trivial => 0,
image => '',
imagetitle => '',
];
# Reset 'referrer' string
$req2->header('Referrer' => '');
# Attach a cookie
$ua->cookie_jar->add_cookie_header($req2);
# print $req2->as_string, "\n"; exit; # for debug
MIRRORING: while(1) {
$res = $ua->request($req2);
if($res->is_success){
print "Entry successfully added.\n";
warn 'ok: ', $res->as_string;
last;
} elsif($res->is_redirect) {
warn 'ok, redirect: ', $res->header('Location'), "\n";
last;
} elsif($res->is_error) {
if($sleep_cnt++ < MAX_RETRY && $res->status_line =~ /timeout/i) {
warn "Server returned error code. Retrying after 5 secs...\n";
sleep 5;
next MIRRORING;
}
my $ent_dt = localtime($entry->{epoc});
die $res->status_line, "\n",
"Error: An error occurred when trying to post this entry\n",
"-- [$ent_dt]\n",
} else {
die "Something strange happened.\n";
}
}
sleep 3; # wait few seconds until next post
}
}
sub generate_hatena_body {
my $e = shift;
$e->{body} =~ tr/\012\015//d;
$e->{body} =~ s!<br\s*/?>!\012!ig; #{{{
$_ = <<BODY;
*$e->{epoc}*[<a href="${base_url}$e->{id}">$s_nickの日記</a> ($s_id)] ($e->{title})
$e->{body}
BODY
#}}}
return $e->{comment_cnt} == 0 ?
$_ :
$_ . <<BODY_FOOTER;
[コメント... <a href="$e->{comment_url}">[srad.jp]</a>]
BODY_FOOTER
}
sub read_journals_from_xml {
1; # dummy
}
# --- EOF ------------------------------------------------------------
この議論は賞味期限が切れたので、アーカイブ化されています。
新たにコメントを付けることはできません。
メリークリスマスでした。 (スコア:1)
開発・改良を続けてくださっていることに、敬意と感謝を示します。特に、出典とコメントを採用していただけたのには、もううれしくてうれしくて。とてもありがたいことです。
ところで、1点だけ。AM12:00の扱いですけれど、転載先で午後の12時の扱いになってしまうようです。これは、AM0:00と表記しないスラドが悪いんですけれども…
Re:メリークリスマスでした。 (スコア:1)
プログラムの寿命は、
利用してくれる人がいてこそ長くなるものなので、
使ってもらえて嬉しいです :)
ところで、AM12:00の件はちょこっと修正して、表のエントリを
たった今ひっそりとアップデートしました。
まだ変更をかけたソースを実際に動作させてませんので、
これで大丈夫かどうかは判りません。
何か問題があったら、またここに書き込みます~