#!/usr/bin/perl
##
# dump_wordpress.pl - dump wordpress db -> multimarkdown files
##
# Copyright (C) 2016 by attila <attila@stalphonsos.com>
# 
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
# PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.
##
use strict;
use warnings;
use Getopt::Std;
use IO::File;
use DBI;
use vars qw($VERBOSE $VERSION);

$VERSION = '0.1.0';

# c.f. Getopt::Std pod: with this set, getopts() exits once it has
# handled --help or --version instead of carrying on with the run.
$Getopt::Std::STANDARD_HELP_VERSION = 1;

# Called by Getopt::Std for --version.  Prints the script name and
# version on stderr.
sub VERSION_MESSAGE { print STDERR qq|dump_wordpress.pl v.$VERSION\n|; }
# Print the usage summary on stderr.  Called by Getopt::Std for --help
# and by usage().  The switches listed here mirror the getopts() spec
# and the defaults applied in the main program.
sub HELP_MESSAGE {
	print STDERR <<__HeLP__;
purpose: dump wordpress db -> multimarkdown files
usage: dump_wordpress.pl [-v] [-d dbname] [-D dsn] [-T table]
                         [-u user] [-p pass] outdir
  bool opts:
    -v                       verbose
  opts with args:
    -d dbname                database name (default: wordpress)
    -D dsn                   DBI dsn (default: dbi:mysql:dbname=<dbname>)
    -T table                 posts table (default: wp_posts)
    -u user                  database user
    -p pass                  database password

  usage notes:
    One .md file per post is written into outdir, which is created
    if it does not exist.
__HeLP__
}
# Show the help text and exit.  When called with a message, the message
# is reported on stderr first and the exit status is 1; with no
# arguments the exit status is 0.
sub usage {
	my @complaint = @_;
	my $status = 0;
	if (@complaint) {
		print STDERR "$0: ERROR: @complaint\n";
		$status = 1;
	}
	HELP_MESSAGE();
	exit($status);
}

# command line: raw getopts() results and the settings derived from them
our %opts;
our($outdir, $dbname, $dsn, $table, $user, $pass);

# database handle, the posts query and the row currently being handled
our($dbh, $posts_sql, $posts_q, $row);

# consulted by handle_post() when it resolves NNN-revision names
our %posts;

# tables consulted by get_tags()
our($rels_table, $taxo_table, $terms_table) =
    ('wp_term_relationships', 'wp_term_taxonomy', 'wp_terms');

# suck tags for a post id# out of mysql
#
#   $id - post id; matched against object_id in $rels_table
#
# Returns the sorted, de-duplicated names of the terms attached to the
# post whose taxonomy is 'post_tag'.  Dies if a statement cannot be
# prepared or executed.  Uses the globals $dbh, $rels_table,
# $taxo_table and $terms_table.
sub get_tags {
	my($id) = @_;
	my $rels_sql = qq{select term_taxonomy_id as tid from ${rels_table} }.
	    q{where object_id=?};
	my $taxo_sql = qq{select * from ${taxo_table} }.
	    q{where term_taxonomy_id=?};
	my $terms_sql = qq{select * from ${terms_table} }.
	    q{where term_id=?};
	# prepare each statement once up front, not once per row
	my $rels_q = $dbh->prepare($rels_sql) or
	    die "prepare($rels_sql): $DBI::errstr";
	my $taxo_q = $dbh->prepare($taxo_sql) or
	    die "prepare($taxo_sql): $DBI::errstr";
	my $terms_q = $dbh->prepare($terms_sql) or
	    die "prepare($terms_sql): $DBI::errstr";
	my %tags;
	# errstr is a method call and does not interpolate inside a
	# string, so it is concatenated onto the message instead
	$rels_q->execute($id) or
	    die "execute($rels_sql,id=$id): " . $rels_q->errstr;
	while (defined(my $rel = $rels_q->fetchrow_hashref())) {
		my $taxid = int($rel->{'tid'});
		$taxo_q->execute($taxid) or
		    die "execute($taxo_sql): " . $taxo_q->errstr;
		my $taxo = $taxo_q->fetchrow_hashref();
		# only post_tag entries count; other taxonomies are skipped
		if ($taxo && $taxo->{'taxonomy'} eq 'post_tag') {
			my $termid = int($taxo->{'term_id'});
			$terms_q->execute($termid) or
			    die "execute($terms_sql,$termid): " .
				$terms_q->errstr;
			my $term = $terms_q->fetchrow_hashref();
			$tags{$term->{'name'}}++ if $term;
			$terms_q->finish();
		}
		$taxo_q->finish();
	}
	$rels_q->finish();
	return sort(keys(%tags));
}

# mung vaguely htmlish content into markdown where possible
#
#   $html - post body text
#
# Returns a copy of $html in which the tags and entities handled below
# have been rewritten; anything else passes through untouched.
sub markdownify {
	my($html) = @_;
	my $md = $html;
	# CRLF and bare CR both become one newline; mapping every \r to
	# \n on its own would turn each CRLF into a blank line
	$md =~ s,\r\n?,\n,g;
	# <hN ...>text</hN> -> N hashes either side of the text
	$md =~ s,<h(\d).*?>(.*?)</h\1>,("#" x $1)." $2 ".("#" x $1),gsei;
	# pick the href out of the tag wherever it sits and drop the other
	# attributes, so they cannot leak into the link target
	$md =~ s,<a\s(?:[^>]*?\s)?href=(['"])(.*?)\1[^>]*>(.*?)</a>,[$3]($2),gsi;
	$md =~ s,<tt>(.*?)</tt>,`$1`,gsi;
	# <pre> opens a fence whether or not it carries attributes
	$md =~ s,<pre\b[^>]*>,```,gsi;
	$md =~ s,</pre>,```,gsi;
#	$md =~ s,<blockquote>,```,gsi;
#	$md =~ s,</blockquote>,```,gsi;
	$md =~ s,<p>,,gsi;
	$md =~ s,</p>,\n,gsi;
	$md =~ s,(<i>|</i>|<em>|</em>),*,gsi;
	$md =~ s,(<b>|</b>),**,gsi;
	$md =~ s,&nbsp;, ,gsi;
	$md =~ s,&gt;,>,gsi;
	$md =~ s,&lt;,<,gsi;
	# &amp; is decoded last so that "&amp;lt;" comes out as the
	# literal text "&lt;" rather than being decoded twice
	$md =~ s,&amp;,&,gsi;
	return $md;
}

# Trim whitespace from both ends of a string.
#
#   $str - text to trim
#
# Returns the trimmed copy; whitespace inside the text is kept.
sub strip {
	my($str) = @_;
	$str =~ s/\A\s+//;
	$str =~ s/\s+\z//;
	return $str;
}

# process a single post
#
#   $data - hashref holding one row of the posts table
#
# Writes <outdir>/<name>.md: a block of "Label: value" metadata lines,
# an optional "Tags:" line, a blank line, then the markdownified body.
# The output directory is created on first use.  Names of ordinary
# posts are remembered in %posts so that a later NNN-revision row can
# be filed under its parent's name.  Dies if the directory or the file
# cannot be created, or if closing the file fails.
sub handle_post {
	my($data) = @_;
	unless (-d $outdir) {
		mkdir($outdir) or die "mkdir($outdir): $!";
	}
	my $id = int($data->{'ID'});
	my @tags = get_tags($id);
	my $name = $data->{'post_name'};
	$name = defined($name) ? strip($name) : '';
	# posts without a name would otherwise all overwrite the same
	# ".md" file, so fall back on the id
	$name = "post-$id" unless length($name);
	my $outfile;
	if ($name =~ /^(\d+)-revision$/) {
		my $rid = $1;
		if (!exists($posts{$rid})) {
			warn("#$id '$name' refers to nonexistent #$rid\n");
		} else {
			# file revisions as <parent>-revision-1, -2, ...
			# taking the first name not already on disk
			my $base = $posts{$rid} . '-revision';
			my $n = 0;
			do {
				++$n;
				$name = sprintf(q{%s-%d},$base,$n);
				$outfile = join('/',$outdir,"${name}.md");
			} while (-f $outfile);
		}
	} else {
		# remember the name so this post's revisions can find it;
		# nothing else ever fills in %posts
		$posts{$id} = $name;
	}
	$outfile = join('/',$outdir,"${name}.md") unless defined($outfile);
	debug("writing post id $id to $outfile");
	# pass the mode separately so the file name is never parsed as
	# part of an open() mode string
	my $fh = IO::File->new($outfile, 'w') or die "$outfile: $!";
	# column => label and sprintf format, in output order
	my @fields = (
		[post_title => 'Title', '%s'],
		[post_name => 'name', '%s'],
		[post_date => 'Date', '%s'],
		[post_modified => 'Edit', '%s'],
		[ID => 'Wordpress ID', '%d'],
		[post_status => 'Wordpress Status', '%s'],
	    );
	for my $field (@fields) {
		my($key,$label,$fmt) = @$field;
		$fh->write(sprintf(qq{$label: $fmt\n},$data->{$key}));
	}
	$fh->write(sprintf(qq{Tags: %s\n},join(",",@tags))) if @tags;
	$fh->write("\n");
	$fh->write(markdownify($data->{'post_content'})."\n");
	# buffered write errors only show up at close time
	$fh->close() or die "close($outfile): $!";
}

# Main program: parse the command line, connect to the database and
# hand every row of the posts table to handle_post() in ID order.

# getopts() returns false when it meets a switch it does not know
getopts('vd:D:T:u:p:', \%opts) or usage('invalid command line options');
usage('no directory given') unless @ARGV;
$outdir = shift(@ARGV);
usage('too many arguments given') if @ARGV;
$VERBOSE = $opts{'v'} || 0;
my $debugout = $VERBOSE ? sub { warn("# @_\n"); } : sub { };
# hand the message on explicitly: &$debugout() with empty parens calls
# the closure with no arguments at all, which loses the message
sub debug { $debugout->(@_); }
$dbname = $opts{'d'} || 'wordpress';
$dsn = $opts{'D'} || qq{dbi:mysql:dbname=$dbname};
$table = $opts{'T'} || 'wp_posts';
$user = $opts{'u'};
$pass = $opts{'p'};
# -u is optional, so keep undef out of the messages below
my $who = defined($user) ? $user : '(no user given)';

$dbh = DBI->connect($dsn, $user, $pass,
			{ RaiseError => 1,
			  PrintError => 1 })
    or die "could not connect to db at $dsn as $who: $DBI::errstr";
debug("connected to $dsn as $who");

$posts_sql = qq{select * from $table order by ID asc};
$posts_q = $dbh->prepare($posts_sql)
    or die "prepare '$posts_sql': $DBI::errstr";
debug("prepared: $posts_sql");
# errstr is a method call and does not interpolate inside a string
$posts_q->execute()
    or die "execute '$posts_sql': " . $posts_q->errstr;
while (defined($row = $posts_q->fetchrow_hashref())) {
	handle_post($row);
}
$dbh->disconnect();

exit(0);

##
# Local variables:
# mode: perl
# tab-width: 8
# perl-indent-level: 8
# perl-continued-statement-offset: 4
# indent-tabs-mode: t
# comment-column: 40
# End:
##
