# משתמש:Erel Segal/סקריפט להעלאת דפים
# מראה
#
# u2w = Upload To Wikitext
#
# SYNTAX:
#   perl u2w.pl <input-file>
#
# The input file is read line by line and understands these markers:
#   ##### title    starts a new page; the following lines are its content.
#   ENDOFFILE      ends the content of the current page and uploads it.
#   %%%%% title    edits the existing page "title" in place (see edit_function).
# Page content of the form "+++++ text" is prepended to the existing page
# instead of replacing it (see upload_file).
#
# NOTE: this file has no `use utf8` and the visible code never decodes its
# input, so all Hebrew text is handled as raw UTF-8 bytes (the byte-wise
# substr() and the [א-ת] character classes below rely on that).  Do not add
# `use utf8` without revisiting them.
#
use strict;
use warnings;

use htmlspecialchars;
use Hebrew_utf8;
use TNK_utf8;

package main;
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use HTTP::Cookies;
# use WWW::Mechanize; doesn't work: HTTP/1.0 403 Forbidden
use IO::Handle;

my $in_file  = shift || "";
my $summary  = "עדכון אוטומטי";    # default edit summary
my $username = "<username>";
my $password = "<password>";

# Byte sequences that are turned into a plain "-" before uploading.
# The original pattern was an empty regex (the literal character was lost),
# which makes Perl silently reuse the last successfully matched pattern.
# NOTE(review): assumed to be the Word en dash (cp1255 byte 0x96), i.e. U+2013
# after conversion to UTF-8, or U+0096 if it was converted as a control
# character.  A bare \x96 must NOT be used: it is a valid continuation byte of
# Hebrew letters in UTF-8.  Add \xE2\x80\x94 if em dashes should be covered too.
my $word_dash_re = qr/\xE2\x80\x93|\xC2\x96/;

# INPUT:  the current page content.
# OUTPUT: the new page content.
# NOTE: To use this function, create a file with entries such as:
#  %%%%% title1
#  %%%%% title2
#  %%%%% title3
#  etc...
sub edit_function {
    my $content = shift;

    # Rename a category.  \Q...\E keeps the old name a literal string.
    my $cat0 = "תוכחה, ביקורת";
    my $cat1 = "ביקורת ותוכחה";
    $content =~ s!קטגוריה\s*:\s*\Q$cat0\E!קטגוריה:$cat1!g;

    # Wrap the header/footer templates in <noinclude>.  The braces are escaped
    # so the patterns stay valid on Perl versions that reject a bare "{".
    $content =~ s!(\{\{כותרת עליונה תנך ללא ניקוד\|.*?\}\})!<noinclude>$1</noinclude>!mg;
    $content =~ s!(\{\{כותרת תחתונה תנך ללא ניקוד\|.*?\}\})!<noinclude>$1</noinclude>!mg;
    #$content =~ s!^({{קיצור דרך[^{}]+}})$!<noinclude>$1</noinclude>!mg;

    # Collapse doubled <noinclude> tags produced by wrapping an already
    # wrapped template.
    $content =~ s!(</?noinclude>)</?noinclude>!$1!g;
    return $content;
}

$main::TARGET_URL = "http://he.wikisource.org/w";

die "SYNTAX: perl u2w.pl <input-file>\n" unless length $in_file;

#print `iconv -f windows-1255 -t utf-8 $in_file > $in_file.utf8`;
# NOTE(review): the script reads "<input-file>.utf8"; with the iconv call above
# disabled that file has to be prepared beforehand - confirm this is intended.
$in_file = "$in_file.utf8";

$main::REWRITE_EXISTING_PAGES = (0 || $in_file =~ /existing/);

my $file_for_pages_that_already_exist  = derive_output_file($in_file, 'existing');
my $file_for_pages_whose_upload_failed = derive_output_file($in_file, 'failed');
my $file_for_debug                     = derive_output_file($in_file, 'debug');

my $count_pages = 0;

my $browser = LWP::UserAgent->new();    # WWW::Mechanize->new();
my @ns_headers = (
    'User-Agent'      => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7) Gecko/20041107 Firefox/1.0',
    'Accept'          => 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*',
    'Accept-Charset'  => 'iso-8859-1,*,utf-8',
    'Accept-Language' => 'en-US',
);
$browser->cookie_jar( {} );
push @{ $browser->requests_redirectable }, 'POST';

# Log in; the session cookie is kept in the browser's cookie jar.
my $response = $browser->post(
    "$main::TARGET_URL/index.php?title=%D7%9E%D7%99%D7%95%D7%97%D7%93:Userlogin&action=submitlogin&type=login&returnto=%D7%A2%D7%9E%D7%95%D7%93_%D7%A8%D7%90%D7%A9%D7%99",
    @ns_headers,
    Content => [
        wpName         => $username,
        wpPassword     => $password,
        wpRemember     => "1",
        wpLoginattempt => "%D7%9B%D7%A0%D7%99%D7%A1%D7%94+%D7%9C%D7%97%D7%A9%D7%91%D7%95%D7%9F",
    ],
);
warn "Login request failed: " . $response->status_line . "\n" if $response->is_error;

open(my $debug_fh, '>', $file_for_debug)
    or die "Could not write to debug file $file_for_debug.\n";

#############
# upload:
#############
# Every raw response is kept for the optional debug dump near the end of the
# script; note that this list grows with every upload.
my @responses;

# Pages that were skipped because they already exist are written here, in the
# same format as the input, unless existing pages are being rewritten.
my $existing_fh;
unless ($main::REWRITE_EXISTING_PAGES) {
    open($existing_fh, '>', $file_for_pages_that_already_exist)
        or die "Could not write to $file_for_pages_that_already_exist: $!\n";
    $existing_fh->autoflush(1);
}
open(my $failed_fh, '>', $file_for_pages_whose_upload_failed)
    or die "Could not write to $file_for_pages_whose_upload_failed: $!\n";
$failed_fh->autoflush(1);

my $wpTextbox1   = '';
my $name_of_page = '';
print "uploading to $main::TARGET_URL\n";
open(my $in_fh, '<', $in_file) or die "Cannot open $in_file as input\n";
while (my $line = <$in_fh>) {
    if ($line =~ /^#####(.*)$/) {
        # Start of a new page: remember its title.
        $name_of_page = trim_title($1);
        next;
    }
    elsif ($line =~ /^%%%%%(.*)/) {
        # In-place edit of an existing page.
        $name_of_page = trim_title($1);
        print "Changing $name_of_page ";
        upload_file($name_of_page, "%%%%%");
        print {$debug_fh} "changed $name_of_page\n";
        $wpTextbox1   = '';
        $name_of_page = '';
        $count_pages++;
        next;
    }
    elsif ($line =~ /^ENDOFFILE/) {
        #if ($name_of_page=~/^t/) {$wpTextbox1=''; next;};  # TEMPORARY

        # Fix Word "--" char:
        $name_of_page =~ s/$word_dash_re/-/g;
        $wpTextbox1   =~ s/$word_dash_re/-/g;

        # Remove black-listed links:
        $wpTextbox1 =~ s#google[.]com[/]cse([^ ])*#google.com#ig;

        # add navigation bar
        if ($name_of_page =~ /([א-ת ]+ רבה) ([א-ת]+) ([א-ת]+)/) {
            my ($book, $chapter, $verse) = ($1, $2, $3);
            my $sargel = sargel_niwut($book, $chapter, $verse);
            $wpTextbox1 = "$sargel\n$wpTextbox1";
        }

        my $short_name_of_page = '';
        if ($name_of_page =~ /^ביאור:/) {
            ($short_name_of_page = $name_of_page) =~ s/ביאור://;
            # put shorter names as titles in categories:
            $wpTextbox1 =~ s/\[\[קטגוריה:([^\|\[\]]*)\]\]/\[\[קטגוריה:$1|$short_name_of_page\]\]/ig;
        }
        #print "end $name_of_page \n";

        # A redirect directive must not be preceded by whitespace.
        $wpTextbox1 =~ s/\s*(#REDIRECT)/$1/ig;
        $wpTextbox1 =~ s/\s*(#הפניה)/$1/ig;
        $wpTextbox1 =~ s|\[http://he.judaism.wikia.com/wiki/([a-zA-Z0-9\/_-]*) ([^\]]*)\]|[[$1\|$2]]|g;    # fix inner links

        # Only false alarms...
        #if ($wpTextbox1 =~ /ביאור:ביאור/ || $wpTextbox1=~/שיחה:/) {
        #    die("Deprecated content: $wpTextbox1");
        #}

        print "$name_of_page";
        upload_file($name_of_page, $wpTextbox1, "");
        print {$debug_fh} "uploaded $name_of_page\n";
        $count_pages++;

        # A shortcut template in the page also gets a redirect page of its own.
        if ($wpTextbox1 =~ /\{\{קיצור דרך\|([^{}]+)\}\}/i && $in_file !~ /existing/) {
            my $name_of_shortcut = $1;
            print "$name_of_shortcut";
            (my $redirect_target = $name_of_page) =~ s/^קטגוריה:/:קטגוריה:/;
            upload_file($name_of_shortcut, "#REDIRECT [[$redirect_target]]");
            print {$debug_fh} "uploaded shortcut $name_of_page\n";
            $count_pages++;
        }
        $wpTextbox1         = '';
        $name_of_page       = '';
        $short_name_of_page = '';
        next;
    }
    $wpTextbox1 .= $line;
}
close($in_fh);

# A "#####" header without a closing ENDOFFILE line is never uploaded; say so
# instead of dropping the page silently.
if ($name_of_page ne '') {
    warn "Input ended without ENDOFFILE - page '$name_of_page' was not uploaded.\n";
}

print "Uploaded $count_pages pages. Everything seems to be OK. Log was written to $file_for_debug.";
print " New content of existing pages was written to $file_for_pages_that_already_exist"
    unless ($main::REWRITE_EXISTING_PAGES);
print "\n\n";

# print {$debug_fh} "_____________ uploads _____________\n";
# print {$debug_fh} @responses;

close($debug_fh) or warn "Could not close $file_for_debug: $!\n";
if ($existing_fh) {
    close($existing_fh) or warn "Could not close $file_for_pages_that_already_exist: $!\n";
}
close($failed_fh) or warn "Could not close $file_for_pages_whose_upload_failed: $!\n";
exit;

########################################################

# Build the name of an output file from the input file name: the first ".txt"
# becomes ".<tag>.txt".  When the input name contains no ".txt" the suffix is
# appended instead, so an output file can never be the input file itself
# (which would be truncated when it is opened for writing).
sub derive_output_file {
    my ($input_name, $tag) = @_;
    my $derived = $input_name;
    $derived =~ s/[.]txt/.$tag.txt/ or $derived .= ".$tag.txt";
    return $derived;
}

# Strip leading and trailing whitespace from a page title.
sub trim_title {
    my ($title) = @_;
    $title =~ s/\A\s+//;
    $title =~ s/\s+\z//;
    return $title;
}

# Percent-encode a page title for use as the "title" query parameter, so that
# characters such as "&", "#", "+" or "?" inside a title cannot change the
# meaning of the URL.  Works byte by byte (titles are raw UTF-8 bytes here).
sub escape_title_for_url {
    my ($title) = @_;
    $title =~ s/([^A-Za-z0-9_.~:\/-])/sprintf("%%%02X", ord($1))/ge;
    return $title;
}

# Return the first capture group of $pattern in $html, or '' when the pattern
# does not match.  Reading $1 after an unchecked match would otherwise return
# the capture of some earlier, unrelated match.
sub extract_form_field {
    my ($html, $pattern) = @_;
    my ($value) = $html =~ $pattern;
    return defined $value ? $value : '';
}

# Append one page to $fh in the input-file format, so that the file can be fed
# back to this script.  Does nothing when the handle was never opened.
sub record_page {
    my ($fh, $title, $content) = @_;
    return unless $fh;
    print {$fh} "##### $title\n$content\nENDOFFILE\n";
    return;
}

# Upload, edit or extend a single wiki page.
#
# PARAMETERS:
#   $name_of_page  - title of the page.
#   $wpTextbox1    - new content; "%%%%%" requests an in-place edit of the
#                    current content, "+++++ text" prepends text to it.
#   $page_summary  - optional edit summary; the global default $summary is
#                    used when it is missing or empty.
#
# Existing (or previously deleted) pages are skipped and recorded in the
# "existing" file unless $main::REWRITE_EXISTING_PAGES is set; failed uploads
# are recorded in the "failed" file.  Returns nothing.
sub upload_file {
    my ($name_of_page, $wpTextbox1, $page_summary) = @_;
    #my $is_redirect = ($wpTextbox1 =~/#הפניה/ || $wpTextbox1 =~/#redirect/i);
    #return if (!$is_redirect);  # TEMP

    # Sanity check
    if (!$name_of_page) {
        print "Empty page name!\n";
        return;
    }

    # The summary is kept per call: changing the global here would leak the
    # special summaries below into every page uploaded afterwards.
    $page_summary = $summary unless defined $page_summary && length $page_summary;

    my $title_in_url = escape_title_for_url($name_of_page);

    # Fetch the edit form to obtain the hidden fields and the current content.
    my $response = $browser->get("$main::TARGET_URL/index.php?title=$title_in_url&action=edit");
    my $output1  = $response->as_string;
    if ($output1 =~ /badtitle/i) {
        # Print the title as byte codes to make invisible characters visible.
        $name_of_page =~ s/(.)/sprintf ("%d ", ord($1))/ge;
        print "Bad title '$name_of_page'\n\n";
        return;
    }
    if ($output1 =~ /mw-recreate-deleted-warn/ && !$main::REWRITE_EXISTING_PAGES) {
        print ": deleted - skipping:\n\n";
        record_page($existing_fh, $name_of_page, $wpTextbox1);
        return;
    }

    my $wpStarttime   = extract_form_field($output1, qr/value="(\d+)" name="wpStarttime"/);
    my $wpEdittime    = extract_form_field($output1, qr/value="(\d+)" name="wpEdittime"/);
    my $wpEditToken   = extract_form_field($output1, qr/value="(.+?)" name="wpEditToken"/);
    my $wpAutoSummary = extract_form_field($output1, qr/name="wpAutoSummary" type="hidden" value="(.+?)"/);
    my $wpOldContent  = extract_form_field($output1, qr/<textarea[^<>]*>(.*?)<\/textarea>/s);

    if ($wpTextbox1 =~ /%%%%%/) {    # edit
        $wpOldContent = htmlspecialchars_decode($wpOldContent);
        if ($in_file =~ /milon/ && $name_of_page =~ /קטגוריה:([א-ת]+) \(שורש\)/) {
            # Move the content from the category pages to the root pages in
            # Wiktionary.
            my $jorj = $1;
            my $ot0  = substr($jorj, 0, 2);    # first two bytes of the root
            #print "jorj=$jorj; ot=$ot0; \n"; die;
            if ($wpOldContent =~ /נטיות פעלים/) {
                my $newContent = $wpOldContent;
                $newContent =~ s/קטגוריה:$ot0 \(שורשים\)/קטגוריה:$jorj (שורש)|*/;
                upload_file("$jorj (שורש)", $newContent, "העברה מדף הקטגוריה של השורש");
                $wpTextbox1   = "{{:$jorj (שורש)}}\n<noinclude>[[קטגוריה:$ot0 (שורשים)]]</noinclude>\n";
                $page_summary = "העברת התוכן לדף השורש";
            }
            else {
                print "$name_of_page already changed - skipping\n";
                return;
            }
        }
        elsif ($name_of_page =~ /רבי יונה על/) {
            # Rename the commentaries of Rabbenu Yonah: copy the content to
            # the new title and leave a redirect behind.
            (my $new_name_of_page = $name_of_page) =~ s/רבי יונה/רבנו יונה/g;
            if ($wpOldContent !~ /הפניה/) {
                my $newContent = $wpOldContent;
                $newContent =~ s/רבי יונה/רבנו יונה/g;
                upload_file($new_name_of_page, $newContent, "העברה מהדף [[$name_of_page]]");
                $wpTextbox1   = "#הפניה [[$new_name_of_page]]\n";
                $page_summary = "העברה לדף [[$new_name_of_page]]";
            }
            else {
                print "$name_of_page already changed - skipping\n";
                return;
            }
        }
        else {
            $wpTextbox1 = edit_function($wpOldContent);
            #print "Current content: " . substr($wpOldContent,0,1000) . "...\n";
            # print "New content: " . substr($wpTextbox1,0,1000) . "...\n";

            # Sanity check
            if (!$wpTextbox1) {
                die "New content is empty - probably a bug in your edit_function!";
            }
        }
    }
    elsif ($wpTextbox1 =~ /[+][+][+][+][+]\s+(.*)/s) {    # add
        my $contentToAdd = $1;
        if ($contentToAdd =~ /קיצור (.*)/) {
            # The added text names a shortcut: add the shortcut template and
            # create the redirect page for it.
            my $name_of_shortcut = $1;
            $contentToAdd = "<noinclude>{{קיצור דרך|$name_of_shortcut}}</noinclude>";
            print " $name_of_shortcut";
            upload_file($name_of_shortcut, "#REDIRECT [[$name_of_page]]");
            print {$debug_fh} "uploaded shortcut $name_of_page\n";
            $count_pages++;
        }
        $wpTextbox1 = htmlspecialchars_decode("{{דרוש שילוב}}\n----\n$contentToAdd\n----\n$wpOldContent");
    }
    else {    # upload
        my $are_both_redirects =
               ($wpOldContent =~ /#הפניה/ || $wpOldContent =~ /#redirect/i)
            && ($wpTextbox1   =~ /#הפניה/ || $wpTextbox1   =~ /#redirect/i);
        if (   $wpOldContent
            && $wpOldContent =~ /./
            && !$main::REWRITE_EXISTING_PAGES
            && !$are_both_redirects)
        {
            print ": already contains data - skipping:\n\n";    # \n$wpOldContent\n";
            record_page($existing_fh, $name_of_page, $wpTextbox1);
            return;
        }
        else {
            # usual upload - do some conversions before uploading:
            # convert indirect links to wikisource to direct links
            if ($main::TARGET_URL =~ /wikisource/) {
                $wpTextbox1 =~ s{\[http://he.wikisource.org/wiki/([^ \]]+) ([^\]]+)]}{[[$1|$2]]}ig;
            }
        }
    }

    # Without an edit token the submission cannot succeed (for example when
    # the login failed); report it instead of posting a meaningless form.
    if ($wpEditToken eq '') {
        print " No edit token found - upload failed!\n\n";
        record_page($failed_fh, $name_of_page, $wpTextbox1);
        print {$debug_fh} "$name_of_page failed - no edit token!\n";
        return;
    }

    # the upload itself
    $response = $browser->post(
        "$main::TARGET_URL/index.php?title=$title_in_url&action=submit",
        @ns_headers,
        Content_Type => 'form-data',
        Content      => [
            wpStarttime   => $wpStarttime,
            wpEdittime    => $wpEdittime,
            wpTextbox1    => "$wpTextbox1",
            wpSummary     => $page_summary,
            wpEditToken   => $wpEditToken,
            wpAutoSummary => $wpAutoSummary,
            wpWatchthis   => 'on',
        ],
    );
    push @responses, $response->as_string;

    if ($response->code != 302 && $response->code != 200) {
        print " Upload failed!\n\n";
        record_page($failed_fh, $name_of_page, $wpTextbox1);
        print {$debug_fh} "$name_of_page failed!\n";
    }
    elsif ($response->content =~ /badtitle/i) {
        print " Bad title!\n\n";
    }
    elsif ($response->content =~ /ספאם/i) {
        print " Spam!\n\n";
        record_page($failed_fh, $name_of_page, $wpTextbox1);
        print {$debug_fh} "$name_of_page failed - spam!\n";
    }
    else {
        #print $response->content;
        print " $name_of_page Uploaded successfully.\n";
    }
    return;
}

# create a navigation bar
#
# Returns the wikitext of a navigation-bar template call for the given book,
# part and (optional) chapter, both given as Hebrew numerals.  With a chapter
# the bar links to the previous and next chapter; without one it links to the
# previous and next part.
# NOTE(review): Hebrew::number2hebrew / Hebrew::hebrew2number are presumably
# provided by the Hebrew_utf8 module loaded above - confirm.
sub sargel_niwut {
    my ($book, $part, $chapter) = @_;

    if ($chapter) {
        # The first chapter has no predecessor.
        my $previous_chapter =
            $chapter eq 'א'
            ? ""
            : Hebrew::number2hebrew(Hebrew::hebrew2number($chapter) - 1);
        my $next_chapter = Hebrew::number2hebrew(Hebrew::hebrew2number($chapter) + 1);
        return "{{"
            . join("|", "סרגל ניווט", $book, $part, $previous_chapter, $chapter, $next_chapter)
            . "}}";
    }

    # The first part is preceded by the introduction page.
    my $previous_part =
        $part eq 'א'
        ? "הקדמה"
        : Hebrew::number2hebrew(Hebrew::hebrew2number($part) - 1);
    my $next_part = "$part א";
    return "{{"
        . join("|", "סרגל ניווט", $book, "", $previous_part, $part, $next_part)
        . "}}";
}

__END__