#!/usr/bin/perl -w
# Cormac Reynolds 201512: script to do a complete conversion of a pmwiki to a
# dokuwiki namespace.
#
# known (possible) bugs:  single column indentation for 'code' in pmwiki should be two-column in dokuwiki
#                         Additional formatting not permitted in dokuwiki Headings, so should be stripped
#                         Links that have no link text in Pmwiki, should get the original pmwiki link as link text
#                         Mixing bullet points and indents doesn't work the same in dokuwiki as pmwiki. Should probably get
#                           <code> tags.
#                         footnotes are not converted
#                         Linking to external images does not always work.
use File::Path qw(make_path remove_tree);
use File::Copy::Recursive qw(fcopy rcopy dircopy fmove rmove dirmove);
use File::Basename ;

sub fixlink($) {
  # dokuwiki internal links use ':' for namespaces instead of '.' But need to
  # avoid changing external links. And some other complications: spaces in
  # links, main=root, lowercase, attachments.
  my $oldline = $_[0];
  # update links to the new wiki if in external format to old wiki
  $oldline =~ s|${oldwiki}index.php(\?n=)?||g;
  #print $oldline if $1;
  # or if it was just a link to the old wiki root, make it a link to root
  $oldline =~ s|$oldwiki|Main.HomePage|g;
  my $newline = $oldline;

  while ( $oldline =~ m/\[\[(.*?)\]\]/g) {

    # first extract the pm link address
    my $oldlink = $1;
    $oldlink =~ s/\|.*//;
    $oldlink =~ s/^\s+//;

    # now convert the pm address to a safe dokuwiki equivalent
    my $newlink = $oldlink;
    # the pm 'HomePage' is the doku 'start'
    $newlink =~ s/Home\s*Page/start/;
    $newlink =~ tr/A-Z/a-z/;
    # ignore real external links (typically http:// or mailto://)
    if (not $oldlink =~ m|://|) {
      $newlink =~ s/[\.\/]/:/g;
      # remove spaces and underscores within links
      $newlink =~ s/(\w)[\s_]+(\w)/$1$2/g;
      # main is just the root of the new namespace
      if ($newlink =~ s/^main/$namespace/) {
      } elsif ($newlink =~ m/:/ ) {
        $newlink = $namespace . ':' . $newlink;
      }else {
        # must be a relative rather than absolute link
        $newlink = '.:' . $newlink;
      }
      $oldlink = quotemeta($oldlink); 
      $newline =~ s/$oldlink/$newlink/;
    }
  }
  return $newline
}

sub text_replace($$) {
  # brute force substitution of pmwiki syntax for dokuwiki equivalents
  my $theline = $_[0];
  my $nl = $_[1];
  # remove initial keyword
  $theline =~ s/^text=//;
  # Replace pmwiki newlines with normal newline
  $theline =~ s/$nl/\n/g;
  # Header substitutions
  # note dokuwiki can't have links in headers, so strip headers from lines
  # which contain links.
  $theline =~ s/^!+(.*\[\[)/$1/gm; 
  $theline =~ s/^!{5}(.*)$/== $1 ==/gm;
  $theline =~ s/^!{4}(.*)$/=== $1 ===/gm;
  $theline =~ s/^!{3}(.*)$/==== $1 ====/gm;
  $theline =~ s/^!{2}(.*)$/===== $1 =====/gm;
  $theline =~ s/^!{1}(.*)$/====== $1 ======/gm;
  $theline =~ s/\(:title (.*)\s?:\)/====== $1 ======/gm;
  # Ordered lists (up to 5 deep)
  $space2 = " " x 2;
  $space4 = " " x 4;
  $space6 = " " x 6;
  $space8 = " " x 8;
  $space10 = " " x 10;
  $theline =~ s/^\#{5}/$space10- /gm;
  $theline =~ s/^\#{4}/$space8- /gm;
  $theline =~ s/^\#{3}/$space6- /gm;
  $theline =~ s/^\#{2}/$space4- /gm;
  $theline =~ s/^\#{1}/$space2- /gm;
  # Unordered lists (up to 5 deep)
  $theline =~ s/^\*{5}/$space10* /gm;
  $theline =~ s/^\*{4}/$space8* /gm;
  $theline =~ s/^\*{3}/$space6* /gm;
  $theline =~ s/^\*{2}/$space4* /gm;
  $theline =~ s/^\*{1}/$space2* /gm;
  # font size changes not supported in dokuwiki without plugin, so just strip. Must do before bold in case both are used
  $theline =~ s/['\[]\+{1,2}(.+?)\+{1,2}['\]]/$1/gm;
  $theline =~ s/['\[]-{1,2}(.+?)-{1,2}['\]]/$1/gm;
  # Bold + italic
  $theline =~ s/\'{5}(.*)\'{5}/**\/\/$1\/\/**/g;
  # Bold 
  $theline =~ s/\'{3}/**/g;
  # Italic
  $theline =~ s/\`~/\/\//g;
  $theline =~ s/~\'/\/\//g;
  $theline =~ s/\'{2}/\/\//g;
  # Strike through
  $theline =~ s/{-/<del>/g;
  $theline =~ s/-}/<\/del>/g;
  # Underline
  $theline =~ s/{\+/__/g;
  $theline =~ s/\+}/__/g;
  # Subscript
  $theline =~ s/\'_/<sub>/g;
  $theline =~ s/_\'/<\/sub>/g;
  # Superscript
  $theline =~ s/\'\^/<sub>/g;
  $theline =~ s/\^\'/<\/sub>/g;
  # MonoScript 
  $theline =~ s/@@/\'\'/g;
  # code
  $theline =~ s/\[\@/<code>/g;
  $theline =~ s/\@\]/<\/code>/g;
  $theline =~ s/\(\:cell.*\:\)//g;
  $theline =~ s/\(\:cellnr.*\:\)//g;
  $theline =~ s/\(\:tableend.*\:\)//g;
  $theline =~ s/\(\:table.*\:\)//g;
  # Tables
  $theline =~ s/^\|{2}border=.*$//gm;
  $theline =~ s/^\|{2}align=.*$//gm;
  $theline =~ s/^\|{2}width=.*$//gm;
  $theline =~ s/\|{2}\!/^/g;
  $theline =~ s/\|{2}/|/g;
  # Linebreaks
  $theline =~ s/\Q[[<<]]\E/\\/g;
  # pmwiki encodes some special chars. 
  # %3c = < 
  # %25 = %
  $theline =~ s/%3c/</g;
  $theline =~ s/%25/%/g;
  # dokuwiki does not support new window instruction
  $theline =~ s/%newwin%//g;
  # no colour support in standard dokuwiki, so change to boldface. Do this
  # after specially encoded chars.
  $theline =~ s/%\w*%/**/g;
  # attachments (must do before links as they can look similar). Attachments
  # can be with or without square brackets, and filenames can have spaces iff
  # there are are square brackets (ugh!).
  $theline =~ s/\[\[Attach:\s*(.*?)\]\]/{{$1}}/gm;
  $theline =~ s/Attach:\s*(\S+)/{{$1}}/gm;

  # internal links are different, and complicated
  if ($theline =~ m/\[\[/ ) {
    $theline = fixlink($theline);
  }
  return $theline
}

sub restorepage($$$) {
  # this translated more or less verbatim from pmwiki's own version.
  my $nl = $_[2];
  my @diff = split(/$nl/, $_[0]);
  my @oldtext = split(/$nl/, $_[1]);
  my $newtext = ();
  $diff[0] =~ s/diff:\d+:\d+:(minor)?=//;
  #my $nl = 0;
  my $b1 = ();
  foreach my $x (@diff) {
    if ($x =~ /^(\d+)(,(\d+))?([adc])(\d+)/ ) {
      my $a1 = my $a2 = $1;
      if ($3) {$a2=$3};
      $b1 = $5;
      if ($4 eq 'd') {splice(@oldtext,$b1,$a2-$a1+1)};
      if ($4 eq 'c') {splice(@oldtext,$b1-1,$a2-$a1+1)};
      next;
    }
    #my $nlflag = 0;
    if ($x =~ m/^< /) { next }
    if ($x =~ /^> (.*)$/) {
      #$nlflag=0;
      splice(@oldtext,$b1-1,0,$1); $b1++;
    }
    #if ($x eq '\\ No newline at end of file') {$nl=$nlflag};
  }
  #if ($nl) {$newtext = ()};
  $newtext = join($nl, @oldtext);
  return $newtext;
}

#sub printchange($$$$$){
#my ($version, $host, $type, $page, $author) = @_;



use strict;
#
#################################################
# Check Arguments
if( (@ARGV < 1) || (@ARGV > 3) ){
  print "Usage: pm2dokuwiki.pl <pmwiki dir> <dokuwiki dir>\n";
  exit(1);
}
if( !-d $ARGV[0] ){
  print "Directory Not Valid!\n";
  exit(1);
}
#
#################################################

our $oldwiki = 'http://www.atnf.csiro.au/vlbi/wiki(/)?';
our $namespace  = 'lbaops';

my $pm_wiki_dir = $ARGV[0] . '/wiki.d/';
my $pm_upload_dir = $ARGV[0] . '/uploads/';
my @files = <$pm_wiki_dir/*>;

my($outprefix) = $ARGV[1];

# first process the wiki files - recover current text and past revisions
foreach my $file (@files){
  print "$file\n";
  open INPUTFILE, "<", "$file" or die $!;

  # determine the name of the output file
  my $pm_in = basename($file);
  #$tmpin =~ s|.*/||;
  # Main is just the root namespace in dokuwiki
  $pm_in =~ s/Main\.//;
  # the HomePage is start.txt in dokuwiki (hope there's no clash)
  if ($pm_in eq 'HomePage') {
    $pm_in = 'start'
  }
  my @file_tree = split( /\./, $pm_in);
  my ($outputfile) = $outprefix . '/data/pages/' . $namespace . '/';
  #foreach my $component (@file_tree) {
  #  $outputfile .= '/' . $component;
  #}
  $outputfile .= join('/', @file_tree);
  $outputfile =~ tr/A-Z/a-z/;
  # create the output file if it's not there already
  my ($tmppath) = dirname($outputfile);
  make_path ( $tmppath );

  $outputfile .= '.txt';
  my $text = ();
  my $nl = '%0a';
  my %diffs = ();
  my %hosts = ();
  my %authors = ();
  while(my $theline=<INPUTFILE>){
    if( $theline =~ m/^newline=(.*)/ ){
      # the newline character can change but is then declared at top of page
      $nl = quotemeta($1);
    }
    #if( $theline =~ m/^time=(\d+)/ ) {
    #  $currentver = $1;
    #}
    # convert pmwiki syntax to dokuwiki syntax
    #extract the current version
    if( $theline =~ m/^text/ ){
      # save the raw pmwiki text as $text. We'll use it later
      $text = $theline;
      # convert syntax to dokuwiki
      $theline = text_replace($theline, $nl);
      open (OUTPUTFILE, ">$outputfile") or die $!;
      print OUTPUTFILE $theline;
      close OUTPUTFILE;
    }
    if( $theline =~ m/^diff:(\d+?):(\d+?):/ ){
      # apply each of the diffs in succession and write to new attic file
      # note the diffs don't always come in the order they should be applied so
      # extract them all first
      my $diffver = $2;
      $diffs{$diffver} = $theline;
    } 
    if( $theline =~ m/^host:(\d+)=([\d\.]+)$/ ) {
      # store the hostname($2) for this revision ($1)
      $hosts{$1} = $2;
    }
    if( $theline =~ m/^author:(\d+)=([\S]+)$/ ) {
      # store the author($2) for this revision ($1)
      $authors{$1} = $2;
    }
  }
  close INPUTFILE;
  # EOF, so now apply the diffs and save output
  foreach my $diffver (sort {$b cmp $a} keys %diffs) {
    my $theline = $diffs{$diffver};
    #print "diffver=$diffver\n";
    #print "theline=$theline\n";
    my $atticfile = $outputfile;
    $atticfile =~ s|/data/pages/|/data/attic/|;
    $atticfile =~ s|\.txt$|.$diffver.txt|;
    make_path( dirname($atticfile) );
    # update $text to this latest revision (in raw pmwiki format)
    $text = restorepage($theline, $text, $nl);
    $theline = text_replace($text, $nl);

    open (ATTICFILE, ">$atticfile") or die $!;
    print ATTICFILE $theline;
    close ATTICFILE;
    system ("gzip -f $atticfile");
  }

  # open a change file that will record the list of changes
  my $changefile = $outputfile;
  $changefile =~ s|/data/pages/|/data/meta/|;
  $changefile =~ s|\.txt$|.changes|;
  make_path (dirname($changefile));
  open (CHANGEFILE, ">$changefile") or die $!;
  foreach my $diffver (sort keys %diffs) {
    # and write the diff info to the change file
    my $pagename = join(':', $namespace, @file_tree);
    $pagename =~ tr/A-Z/a-z/;
    $pagename =~ s/\s//g;
    #print "host: $hosts{$diffver}";
    #print "author: $authors{$diffver}";
    if (not exists $hosts{$diffver}) {
        $hosts{$diffver} = 'Unknown';
    }
    if (not exists $authors{$diffver}) {
        $authors{$diffver} = 'Unknown';
    }

    my $change = sprintf("%s\t%s\t%s\t%s\t%s\n", $diffver, $hosts{$diffver}, 'pm', $pagename, $authors{$diffver});
    print CHANGEFILE $change;
  }
  close CHANGEFILE;
}

# now let's recover the data files (this is almost trivial except for spaces in filenames..)

my @pm_uploads = <$pm_upload_dir/*>;
my ($doku_media_dir) = $outprefix . '/data/media/' . $namespace;
make_path( $doku_media_dir );
foreach my $dir (@pm_uploads) {
  # files in main go the new namespace  root.
  my $outdir = $doku_media_dir;
  if (not $dir =~ /Main$/) {
    $outdir .= '/' . basename($dir);
  }
  foreach my $upload (<$dir/*> ) {
    my $outfile = $outdir . '/' . basename($upload);
    $outfile =~ tr/A-Z/a-z/;
    $outfile =~ s/\s/_/g;
    make_path( dirname($outfile) );
    fcopy($upload, $outfile);
  }
}
