パスワードを忘れた? アカウント作成
452788 journal

dsegの日記: /.-J→hatena 日記コピースクリプト v0.0.0.9 2

日記 by dseg
#!/usr/bin/perl -w # [for (x)emacs] -*- coding: euc-jp -*-
# --------------------------------------------------------------------
# Title:  slash2hatena.pl
# Author: dseg <ds26 at mail.goo.ne.jp>
# URL:    http://srad.jp/journal.pl?op=display&uid=14237&id=177265
# Summary:...later...
# Commentary: Definitely a poor hack... Blame me ;)
#
# $Id: slash2hatena.pl,v 0.0.0.9 2004-01-13 07:58:24+09 dseg Exp dseg $
# --------------------------------------------------------------------

=head1 NAME

slash2hatena - copy all journal entries from slashdot.jp to d.hatena.ne.jp

=head1 SYNOPSIS

Usage: slash2hatena.pl [options] <URL>

Required options are:

    [-h (hatena)   USER:PASS ] your username and password of Hatena
    [-s (slashdot) USER or ID] your username or id number of Slashdot-J

Other options are:

    [-t <timeout>]           set the timeout value(DEFAULT: 30sec)
    [-p <proxy-url>]         set the proxy-url to be used
    [-v]                     print version number and quit
    [--help]                 print this help message

=head1 AUTHOR

dseg <ds26@mail.goo.ne.jp>

=cut

require 5;
use strict;

# require 'jcode.pl'; # http://srekcah.org/jcode/jcode.pl-2.13
# use Encode qw/from_to/;
use File::Basename qw/fileparse/;
use Getopt::Std;
use HTTP::Request::Common qw/POST/;
use LWP::UserAgent;
use Pod::Usage;
use Time::Local;

# --- Global Variables -----------------------------------------------
use vars qw/$VERSION %opts $h_uid $h_passwd @s_entries/;
# --------------------------------------------------------------------

# --- Init -----------------------------------------------------------
BEGIN {$SIG{INT} = sub{ die "Interrupted!\n" }; $| = 1} # autoflush
($VERSION) = q$Revision: 0.0.0.9 $ =~ /((?:\d\.)+\d+)/;
my $euc2sjis = \&from_to;

# --- Parse command-line ---------------------------------------------
pod2usage(2) if @ARGV == 0 || $ARGV[0] eq '--help';
getopt('h:s:p:t:v', \%opts);
die File::Basename::basename($0), " v${VERSION}\n" if exists $opts{v};
($h_uid, $h_passwd) = split /:/, $opts{h}, 2;
pod2usage(2) unless $h_uid && $h_passwd && $opts{s};

# --- Firstly, login to hatena ---------------------------------------
my $ua = new LWP::UserAgent(
  timeout    => 30,
  agent      => &myagent,
  cookie_jar => {
#    file         => 'hatena_cookie.txt',
#    autosave     => 1,
    hide_cookie2 => 1,
  },
);
$ua->proxy('http', $opts{p})
  if $opts{p} && index($opts{p}, 'http://') == 0;

# --- Determine the user nick of /.-J if id number is present
my ($s_nick, $s_id);
if($opts{s} =~ /^\d+$/) { # id
  $s_id = $opts{s};
  my $url = "http://srad.jp/users.pl?uid=${s_id}&light=1";
  $_ = $ua->get($url);
  die "Error: Unable to fetch the page from /.-J\n[url=${url}]\n"
    unless $_->is_success;
  ($s_nick) = $_->content =~ /^($s_id)\s*\((?:\d+)\)/mo;
  die "Error: no such userid on /.-J -- [id=$s_id]\n"
    unless $s_nick;
} else { # nick
  $s_nick = $opts{s};
  my $url = "http://srad.jp/users.pl?nick=${s_nick}&light=1";
  $_ = $ua->get($url);
  die "Error: Unable to fetch the page from /.-J\n[url=${url}]\n"
    unless $_->is_success;
  ($s_id) = $_->content =~ /^(?:$s_nick)\s*\((\d+)\)/mo;
  die "Error: no such user on /.-J -- [nick=$s_nick]\n"
    unless $s_id;
}

# --- Login succeeded? exit when failed
die "Failed to login to d.hatena.ne.jp\n"
  unless hatena_login($ua, $h_uid, $h_passwd);

# --- Read the contents of journals of /.-J --------------------------
my $dir = (fileparse($0))[1]; # Incl. trailing 'path char'
my $datafile = $dir . 'slash_journal.xml';
my $base_url = "http://srad.jp/journal.pl?op=display&uid=${s_id}&id=";

# Read the journal entries from xml file or network, and
# store to the global array - @s_entries.

if(-e $datafile && -r _) {
  read_journals_from_xml($datafile);
} else {
  my $s_ua = new LWP::UserAgent(
    timeout    => 30,
    agent      => &myagent,
  );
  $s_ua->proxy('http', $opts{p})
    if $opts{p} && index($opts{p}, 'http://') == 0;

  parse_slash_journal_list($s_ua, $base_url);

  my $total_entries = scalar @s_entries;
  print "\n--- total number of journal entries: $total_entries\n";
  my ($elems, $journal_html, $res, $i);
  my $url_opt = '&theme=greypage&light=1';

  # modify global array @s_entries in here...
  for my $item(@s_entries) {
    printf "processing... %d / %d\n", ++$i, $total_entries;
    next unless $item;
    $res = $ua->get($base_url . $item . $url_opt); # item == each journal id
    die "$!\n" unless $res->is_success;
    $_ = $res->content;
    $elems = &parse_slash_journal
      or die "Error occured when trying to fetch the journal entry from /.-J\n";
    $item = $elems; # overwrite
    sleep 1;
  }
}
# serialize to handy .csv file (encoding: ShiftJIS)
# if(@s_entries > 0) {
#  my $csv = sprintf '%sjournal-%s_%s.csv', $dir, &now =~ /^(\d{8})(\d{6})$/;
#  open(F, "> $csv") or die "$!";
# write_to_csv(\*F, \@s_entries);
#  print "\n/.-J journals saved to $csv (charset: ShiftJIS)\n\n"
#    if -e $csv;
# }
post_to_hatena($base_url);
exit 0; # ..FIX THIS.. RETURN THE PROPER(SUCCESSED/FAILED) VALUE

# [-- Subroutines ---------------------------------------------------]
sub myagent {
  sprintf '%s/%s ', (fileparse($0, qr{\..*}))[0], $main::VERSION;
}

sub now {
  sprintf '%04d%02d%02d%02d%02d%02d',
    sub{($_[5]+1900, $_[4]+1, $_[3]), $_[2], $_[1], $_[0]}->(localtime);
}

# --- Get journal entries from /.-J ---
sub parse_slash_journal_list {
  my ($s_ua, $base_url) = @_;
  # Get the list of journals
  my $res = $s_ua->get(
    "http://srad.jp/journal.pl?op=list&uid=${s_id}&light=1"
  );
  die $res->status_line . "\n" unless $res->is_success;

  @s_entries = map /(\d+)$/,
    grep { index($_, 'journal.pl') > 0 }
    $res->content =~ /<[Aa] HREF="([^\"]+)"/g;
}

sub parse_slash_journal{
  my $elems = {
    id          => undef,
    title       => undef,
    body        => undef,
    year        => undef,
    mon         => undef,
    mday        => undef,
    hour        => undef,
    min         => undef,
    ampm        => undef,
    epoc        => undef,
    comment_cnt => 0,
    comment_url => '',
  };
  my($top, $time_pattern, $end_pattern);
  my $url_pattern = "/journal\\.pl\\?op=display&uid=${s_id}&id=(\\d+)";
  my $date_pattern = '<[^>]+>(\d\d\d\d)\s?年\s?(\d\d)\s?月\s?(\d\d)\s?日</[^>]+>';

  @$elems{qw/comment_cnt comment_url/} = ($1, $2)
    if m!(\d+)個の<[Aa] HREF="([^\"]+)">コメント\.\.\.</[Aa]>!;

  if(($top = index($_, '<!-- start template: ID 51, greypage;journal;default -->')) >= 0) {
    $_ = substr($_, $top, rindex($_, '<!-- end template: ID 51, greypage;journal;default -->'));

    $time_pattern = '<[Ii]>([AP]M)\s(\d\d):(\d\d)</[Ii]>';
    $end_pattern = '</[pP]>$';

    @$elems{qw/year mon mday ampm hour min id title body/} = m{
      $date_pattern.*
      $time_pattern.*?<a\s+href="$url_pattern">(.*?)</[aA]></FONT></B>\n?
      \t*<P>\n?
      \t*(.*)
      \Z
    }sox;
  }
  # Remove trailing garbage
  $elems->{body} =~ s/$end_pattern.*$//mos;

  my $hour24 = $elems->{hour};
  $hour24 -= 12 if $elems->{ampm} eq 'AM' && $elems->{hour} == 12; # AM 12
  $hour24 += 12 if $elems->{ampm} eq 'PM' && $elems->{hour}  < 12; # PM 1 to PM 11
  $elems->{epoc} = timelocal(
    0, $elems->{min}, $hour24, $elems->{mday}, $elems->{mon}-1, $elems->{year}-1900
  );
  return $elems;
}

# --- Write to a file ------------------------------------------------
sub write_to_csv {
  my ($fp, $all) = @_;
  my $crlf = "\x0D\x0A";
  my ($dt, $lines);

  print $fp 'date,title,body,comment,text', $crlf;
  for my $e(@$all) {
    $dt = join('/', @$e{qw/year mon mday/});
    $dt .= ' ' . join(':', @$e{qw/hour min/}) . ' ' . $e->{ampm};
    $e->{body} =~ s/\"/\"\"/g;
    $lines = join($crlf, "$dt,$e->{title},\"", $e->{body}, '",,') . $crlf;
#    print $fp from_to($lines, 'euc-jp', 'shiftjis');
  }
}

sub write_to_xml {
  my ($fp, $all) = @_;

  print $fp <<"XML";
<?xml version="1.0" encoding=""?>
<journal baseurl="http://srad.jp/journal.pl?op=display&uid=${s_id}">
<entries>
XML
  print F "</entries>\n</journal>\n";
  close F || warn "$!\n";
}

# --- Auto-post journal entries --------------------------------------
sub hatena_login {
  my ($ua, $username, $password) = @_;
  my $req = POST 'http://d.hatena.ne.jp/login',
    [ key      => $username,
      password => $password,
      submit   => 'ログイン',
    ];
  my $res = $ua->request($req);

  if($res->is_redirect) {
    $ua->cookie_jar->extract_cookies($res);
    $ua->cookie_jar->save('hatena_cookie.txt');
    # $url = $res->header('Location');
  } else {
    warn "Warning: $res->status_line\n";
    return 0;
  }
  1;
}

sub post_to_hatena {
  my $base_url = shift;
  my $h_post_url = "http://d.hatena.ne.jp/${h_uid}/edit";
  my ($req2, $res);
  my $sleep_cnt = 0;
  use constant MAX_RETRY => 3;

  for my $entry(@s_entries) {
    $req2  = POST $h_post_url,
      [ mode       => 'enter',
    timestamp  => '',
    year       => $entry->{year},
    month      => $entry->{mon},
    day        => $entry->{mday},
    # date     => '20031217',
    title      => $entry->{title},
    body       => generate_hatena_body($entry),
    edit       => 'この内容を登録する',
    trivial    => 0,
    image      => '',
    imagetitle => '',
      ];
    # Reset 'referrer' string
    $req2->header('Referrer' => '');
    # Attach a cookie
    $ua->cookie_jar->add_cookie_header($req2);
#    print $req2->as_string, "\n"; exit; # for debug

    MIRRORING: while(1) {
      $res = $ua->request($req2);
      if($res->is_success){
    print "Entry successfully added.\n";
    warn 'ok: ', $res->as_string;
    last;

      } elsif($res->is_redirect) {
    warn 'ok, redirect: ', $res->header('Location'), "\n";
    last;

      } elsif($res->is_error) {
    if($sleep_cnt++ < MAX_RETRY && $res->status_line =~ /timeout/i) {
      warn "Server returned error code. Retrying after 5 secs...\n";
      sleep 5;
      next MIRRORING;
    }
    my $ent_dt = localtime($entry->{epoc});
    die $res->status_line, "\n",
      "Error: An error occurred when trying to post this entry\n",
      "-- [$ent_dt]\n",

      } else {
    die "Something strange happened.\n";
      }
    }
    sleep 3; # wait few seconds until next post
  }
}

sub generate_hatena_body {
  my $e = shift;
  $e->{body} =~ tr/\012\015//d;
  $e->{body} =~ s!<br\s*/?>!\012!ig; #{{{
$_ = <<BODY;
*$e->{epoc}*[<a href="${base_url}$e->{id}">$s_nickの日記</a> ($s_id)]  ($e->{title})
$e->{body}
BODY
#}}}
  return $e->{comment_cnt} == 0 ?
    $_ :
    $_ . <<BODY_FOOTER;
[コメント... <a href="$e->{comment_url}">[srad.jp]</a>]
BODY_FOOTER
}

sub read_journals_from_xml {
  1; # dummy
}
# --- EOF ------------------------------------------------------------
この議論は賞味期限が切れたので、アーカイブ化されています。 新たにコメントを付けることはできません。
  • すいません、昨晩のうちに返信しなきゃと思っていたのですが、頭が回らなくなってしまって。

    開発・改良を続けてくださっていることに、敬意と感謝を示します。特に、出典とコメントを採用していただけたのには、もううれしくてうれしくて。とてもありがたいことです。

    ところで、1点だけ。AM12:00の扱いですけれど、転載先で午後の12時の扱いになってしまうようです。これは、AM0:00と表記しないスラドが悪いんですけれども…
    • どうもー。
      プログラムの寿命は、
      利用してくれる人がいてこそ長くなるものなので、
      使ってもらえて嬉しいです :)
      ところで、AM12:00の件はちょこっと修正して、表のエントリを
      たった今ひっそりとアップデートしました。

      まだ変更をかけたソースを実際に動作させてませんので、
      これで大丈夫かどうかは判りません。
      何か問題があったら、またここに書き込みます~
      親コメント
typodupeerror

私は悩みをリストアップし始めたが、そのあまりの長さにいやけがさし、何も考えないことにした。-- Robert C. Pike

読み込み中...