00001 <?php
00031 require_once( dirname(__FILE__) . '/Maintenance.php' );
00032
/**
 * Maintenance script that reads an XML dump from standard input and writes
 * one basic HTML file per revision into the directory given by the
 * 'output-dir' option.
 */
class DumpRenderer extends Maintenance {

	/** Number of revisions handled so far; used to number the output files. */
	private $count = 0;

	/**
	 * $outputDirectory: value of the 'output-dir' option.
	 * $startTime: wfTime() value recorded when execute() starts.
	 */
	private $outputDirectory, $startTime;

	/**
	 * Set the script description and register the 'output-dir' option.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = "Take page text out of an XML dump file and render basic HTML out to files";
		$this->addOption( 'output-dir', 'The directory to output the HTML files to', true, true );
	}

	/**
	 * Feed standard input through a WikiImporter, calling handleRevision()
	 * for every revision it reports.
	 *
	 * @return mixed Whatever WikiImporter::doImport() returns.
	 */
	public function execute() {
		$this->outputDirectory = $this->getOption( 'output-dir' );
		$this->startTime = wfTime();

		$source = new ImportStreamSource( $this->getStdin() );
		$importer = new WikiImporter( $source );

		// Objects are passed by handle, so no reference to $this is needed
		// (and taking one is rejected by newer PHP versions).
		$importer->setRevisionCallback(
			array( $this, 'handleRevision' ) );

		return $importer->doImport();
	}

	/**
	 * Revision callback: parse one revision's text and write it out as a
	 * standalone XHTML file named wiki-<counter>-<url-encoded title>.html.
	 *
	 * Must be public, because it is invoked by the importer from outside
	 * this class through the callback registered in execute().
	 *
	 * @param $rev object Revision handed over by the importer; it has to
	 *   provide getTitle() and getText().
	 */
	public function handleRevision( $rev ) {
		$title = $rev->getTitle();
		if ( !$title ) {
			$this->error( "Got bogus revision with null title!" );
			return;
		}
		$display = $title->getPrefixedText();

		$this->count++;

		// rawurlencode() also encodes '/', so the title cannot escape the
		// output directory or create sub-directories.
		$sanitized = rawurlencode( $display );
		$filename = sprintf( "%s/wiki-%07d-%s.html",
			$this->outputDirectory,
			$this->count,
			$sanitized );
		// Progress line: one output filename per revision.
		$this->output( sprintf( "%s\n", $filename ) );

		$user = new User();
		$parser = new Parser();
		$options = ParserOptions::newFromUser( $user );

		$output = $parser->parse( $rev->getText(), $title, $options );

		$html =
			"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" " .
			"\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" .
			"<html xmlns=\"http://www.w3.org/1999/xhtml\">\n" .
			"<head>\n" .
			"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n" .
			"<title>" . htmlspecialchars( $display ) . "</title>\n" .
			"</head>\n" .
			"<body>\n" .
			$output->getText() .
			"</body>\n" .
			"</html>";

		// Report write failures instead of silently dropping the page.
		if ( file_put_contents( $filename, $html ) === false ) {
			$this->error( "Unable to write $filename" );
		}
	}
}
00099
// Name the class defined in this script in $maintClass, then include the file
// named by the DO_MAINTENANCE constant. That constant is defined outside this
// file; presumably the included file instantiates and runs $maintClass -- confirm.
00100 $maintClass = "DumpRenderer";
00101 require_once( DO_MAINTENANCE );