00001 <?php
// Remember the working directory the script was started from; this is
// captured before the bootstrap include below gets a chance to run.
$originalDir = getcwd();

// Command-line bootstrap first, then the backup code this script builds on
// (presumably where BackupDumper is defined -- confirm against backup.inc).
require_once dirname( __FILE__ ) . '/commandLine.inc';
require_once 'backup.inc';
00029
/**
 * Text-filling pass of the XML backup.
 *
 * Reads a stub XML dump whose <text> elements carry only an "id" attribute,
 * looks up the text for each of them and writes the completed XML through
 * an ExportProgressFilter wrapped around the dumper's sink.
 *
 * Text is taken from an optional prior dump (--prefetch) when it has a
 * non-empty entry for the current page/revision, and otherwise either
 * directly from the database or from a spawned fetchText.php subprocess
 * (--spawn).
 */
class TextPassDumper extends BackupDumper {
	/** Prefetch source (a BaseDump) created by --prefetch, or null */
	public $prefetch = null;
	/** Stream URL the stub XML is read from */
	public $input = "php://stdin";
	/** WikiExporter history mode handed to initProgress() */
	public $history = WikiExporter::FULL;
	/** Number of text lookups requested so far */
	public $fetchCount = 0;
	/** Number of lookups that were satisfied by the prefetch source */
	public $prefetchCount = 0;

	/** Running count of failed direct database fetches */
	public $failures = 0;
	/** Number of failures tolerated before getTextDbSafe() rethrows */
	public $maxFailures = 200;
	/** Seconds to sleep after a failed fetch before retrying */
	public $failureTimeout = 5;

	/** PHP executable used to launch the text-fetching subprocess */
	public $php = "php";
	/** Whether text is fetched through a subprocess (--spawn) */
	public $spawn = false;
	/** proc_open() handle of the subprocess, or false when none is running */
	public $spawnProc = false;
	/** Pipe connected to the subprocess's stdin, or false */
	public $spawnWrite = false;
	/** Pipe connected to the subprocess's stdout, or false */
	public $spawnRead = false;
	/** Never assigned by openSpawn() (stderr goes to /dev/null); closed by closeSpawn() if set */
	public $spawnErr = false;

	/**
	 * Run the whole text pass: open the stub input, stream it through the
	 * XML parser (filling in text on the way) and emit a final report.
	 *
	 * Terminates through wfDie() when the input cannot be opened or the
	 * XML cannot be parsed.
	 */
	public function dump() {
		# This shouldn't happen if on console... ;)
		header( 'Content-type: text/html; charset=UTF-8' );

		# Notice messages will foul up your XML output even if they're
		# relatively harmless.
		if( ini_get( 'display_errors' ) ) {
			ini_set( 'display_errors', 'stderr' );
		}

		$this->initProgress( $this->history );

		$this->db = $this->backupDb();

		$this->egress = new ExportProgressFilter( $this->sink, $this );

		$input = fopen( $this->input, "rt" );
		if( $input === false ) {
			// Without this check a bad --stub path was read as an empty,
			// seemingly successful dump.
			wfDie( "Unable to open stub dump input: $this->input\n" );
		}
		$result = $this->readDump( $input );
		fclose( $input );

		if( WikiError::isError( $result ) ) {
			wfDie( $result->getMessage() );
		}

		if( $this->spawnProc ) {
			$this->closeSpawn();
		}

		$this->report( true );
	}

	/**
	 * Apply one command-line option.
	 *
	 * @param $opt   Option name without the leading dashes
	 * @param $val   First part of the option value (for file options, the
	 *               type: "file", "gzip", "bzip2" or "7zip")
	 * @param $param Second part of the option value (for file options, the
	 *               file name)
	 */
	public function processOption( $opt, $val, $param ) {
		$url = $this->processFileOpt( $val, $param );

		switch( $opt ) {
		case 'prefetch':
			global $IP;
			require_once "$IP/maintenance/backupPrefetch.inc";
			$this->prefetch = new BaseDump( $url );
			break;
		case 'stub':
			$this->input = $url;
			break;
		case 'current':
			$this->history = WikiExporter::CURRENT;
			break;
		case 'full':
			$this->history = WikiExporter::FULL;
			break;
		case 'spawn':
			$this->spawn = true;
			if( $val ) {
				// --spawn=<path> overrides the PHP executable to run
				$this->php = $val;
			}
			break;
		}
	}

	/**
	 * Turn a "<type>:<file>" option pair into a stream URL.
	 *
	 * @param $val   Compression type, or a bare value
	 * @param $param File name that goes with a recognised type
	 * @return The stream URL for a recognised type; $val itself otherwise
	 */
	public function processFileOpt( $val, $param ) {
		switch( $val ) {
		case "file":
			return $param;
		case "gzip":
			return "compress.zlib://$param";
		case "bzip2":
			return "compress.bzip2://$param";
		case "7zip":
			return "mediawiki.compress.7z://$param";
		default:
			return $val;
		}
	}

	/**
	 * Progress line that additionally reports the prefetch hit rate.
	 * Falls back to the parent implementation when no prefetch source is
	 * in use.
	 */
	public function showReport() {
		if( !$this->prefetch ) {
			return parent::showReport();
		}

		if( $this->reporting ) {
			$delta = wfTime() - $this->startTime;
			$now = wfTimestamp( TS_DB );

			// Placeholders for every figure that cannot be computed yet
			$rate = '-';
			$revrate = '-';
			$etats = '-';
			$fetchrate = '-';

			if( $delta ) {
				$rate = $this->pageCount / $delta;
				$revrate = $this->revCount / $delta;
				// The ETA needs both counts to be non-zero, otherwise one
				// of the two divisions below would be a division by zero.
				if( $this->revCount && $this->maxCount ) {
					$portion = $this->revCount / $this->maxCount;
					$eta = $this->startTime + $delta / $portion;
					$etats = wfTimestamp( TS_DB, intval( $eta ) );
				}
				// No fetches yet means there is no hit rate to report
				if( $this->fetchCount ) {
					$fetchrate = 100.0 * $this->prefetchCount / $this->fetchCount;
				}
			}
			$this->progress( sprintf( "%s: %s %d pages (%0.3f/sec), %d revs (%0.3f/sec), %0.1f%% prefetched, ETA %s [max %d]",
				$now, wfWikiID(), $this->pageCount, $rate, $this->revCount, $revrate, $fetchrate, $etats, $this->maxCount ) );
		}
	}

	/**
	 * Stream the stub dump through an XML parser whose handlers rebuild
	 * the document with the revision text filled in.
	 *
	 * @param $input Open stream resource to read the stub XML from
	 * @return true on success, or a WikiXmlError describing the parse failure
	 */
	public function readDump( $input ) {
		// Reset the per-pass parser state used by the handlers below
		$this->buffer = "";
		$this->openElement = false;
		$this->atStart = true;
		$this->state = "";
		$this->lastName = "";
		$this->thisPage = 0;
		$this->thisRev = 0;

		$parser = xml_parser_create( "UTF-8" );
		xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );

		// Objects are handles already; taking $this by reference is not
		// needed for the callbacks to reach this instance.
		xml_set_element_handler( $parser, array( $this, 'startElement' ), array( $this, 'endElement' ) );
		xml_set_character_data_handler( $parser, array( $this, 'characterData' ) );

		$offset = 0;
		$bufferSize = 512 * 1024;
		do {
			$chunk = fread( $input, $bufferSize );
			if( $chunk === false ) {
				// Read failure: finish the parse with what we have so that
				// a truncated document is reported instead of accepted.
				$chunk = '';
				$atEnd = true;
			} else {
				$atEnd = feof( $input );
			}
			if( !xml_parse( $parser, $chunk, $atEnd ) ) {
				wfDebug( "TextPassDumper::readDump encountered XML parsing error\n" );
				return new WikiXmlError( $parser, 'XML import parse failure', $chunk, $offset );
			}
			$offset += strlen( $chunk );
		} while( !$atEnd );
		xml_parser_free( $parser );

		return true;
	}

	/**
	 * Get the text for one <text> element, preferring the prefetch source.
	 *
	 * @param $id Text id taken from the stub's <text id="..."> attribute
	 * @return The text to insert
	 */
	public function getText( $id ) {
		$this->fetchCount++;
		if( isset( $this->prefetch ) ) {
			$text = $this->prefetch->prefetch( $this->thisPage, $this->thisRev );
			// A null or empty-string result is not used; in both cases
			// the text is fetched the regular way below.
			if( $text !== null && $text !== "" ) {
				$this->prefetchCount++;
				return $text;
			}
		}
		return $this->doGetText( $id );
	}

	/**
	 * Fetch text through the subprocess or directly, depending on --spawn.
	 */
	private function doGetText( $id ) {
		if( $this->spawn ) {
			return $this->getTextSpawned( $id );
		} else {
			return $this->getTextDbSafe( $id );
		}
	}

	/**
	 * Fetch text from the database, retrying after a pause on failure.
	 *
	 * Both a DBQueryError and a false result from getTextDb() count as a
	 * failure. Once more than $maxFailures failures have accumulated over
	 * the whole run, the last error is thrown instead of retrying.
	 *
	 * @param $id Text id
	 * @return The text
	 * @throws DBQueryError|MWException when the failure limit is exceeded
	 */
	private function getTextDbSafe( $id ) {
		while( true ) {
			try {
				$text = $this->getTextDb( $id );
				// Only thrown if $text turns out to be false below
				$ex = new MWException("Graceful storage failure");
			} catch (DBQueryError $ex) {
				$text = false;
			}
			if( $text === false ) {
				$this->failures++;
				if( $this->failures > $this->maxFailures ) {
					throw $ex;
				} else {
					$this->progress( "Database failure $this->failures " .
						"of allowed $this->maxFailures for revision $id! " .
						"Pausing $this->failureTimeout seconds..." );
					sleep( $this->failureTimeout );
				}
			} else {
				return $text;
			}
		}
	}

	/**
	 * Load one row from the text table and return its text with carriage
	 * returns removed and passed through $wgContLang->normalize().
	 *
	 * @param $id Text id (old_id)
	 * @return The cleaned-up text, or false if Revision::getRevisionText()
	 *         could not produce it
	 */
	private function getTextDb( $id ) {
		global $wgContLang;
		$id = intval( $id );
		$row = $this->db->selectRow( 'text',
			array( 'old_text', 'old_flags' ),
			array( 'old_id' => $id ),
			'TextPassDumper::getText' );
		$text = Revision::getRevisionText( $row );
		if( $text === false ) {
			return false;
		}
		$stripped = str_replace( "\r", "", $text );
		$normalized = $wgContLang->normalize( $stripped );
		return $normalized;
	}

	/**
	 * Fetch text through the subprocess, starting it on first use and
	 * restarting it (after a pause) every time a request fails.
	 *
	 * Unlike getTextDbSafe() there is no retry limit here; the loop keeps
	 * respawning until a request succeeds.
	 *
	 * @param $id Text id
	 * @return The text
	 */
	private function getTextSpawned( $id ) {
		wfSuppressWarnings();
		if( !$this->spawnProc ) {
			// First request: nothing is running yet
			$this->openSpawn();
		}
		while( true ) {
			$text = $this->getTextSpawnedOnce( $id );
			if( is_string( $text ) ) {
				wfRestoreWarnings();
				return $text;
			}

			$this->progress("Database subprocess failed. Respawning...");

			$this->closeSpawn();
			sleep( $this->failureTimeout );
			$this->openSpawn();
		}
	}

	/**
	 * Start the fetchText.php subprocess with pipes on its stdin and
	 * stdout; its stderr is sent to /dev/null.
	 *
	 * @return true if the process was started, false otherwise
	 */
	public function openSpawn() {
		global $IP, $wgDBname;

		$cmd = implode( " ",
			array_map( 'wfEscapeShellArg',
				array(
					$this->php,
					"$IP/maintenance/fetchText.php",
					$wgDBname ) ) );
		$spec = array(
			0 => array( "pipe", "r" ),
			1 => array( "pipe", "w" ),
			2 => array( "file", "/dev/null", "a" ) );
		$pipes = array();

		$this->progress( "Spawning database subprocess: $cmd" );
		$this->spawnProc = proc_open( $cmd, $spec, $pipes );
		if( !$this->spawnProc ) {
			$this->progress( "Subprocess spawn failed." );
			return false;
		}
		// Our write end is the child's stdin, our read end its stdout
		$this->spawnWrite = $pipes[0];
		$this->spawnRead = $pipes[1];

		return true;
	}

	/**
	 * Close the pipes to the subprocess and then the process itself,
	 * resetting all handles to false. Safe to call when nothing is open.
	 */
	private function closeSpawn() {
		wfSuppressWarnings();
		if( $this->spawnRead ) {
			fclose( $this->spawnRead );
		}
		$this->spawnRead = false;
		if( $this->spawnWrite ) {
			fclose( $this->spawnWrite );
		}
		$this->spawnWrite = false;
		if( $this->spawnErr ) {
			fclose( $this->spawnErr );
		}
		$this->spawnErr = false;
		if( $this->spawnProc ) {
			// The handle comes from proc_open(), so it has to be released
			// with proc_close(); pclose() only accepts popen() handles and
			// would leave the child process unreaped.
			proc_close( $this->spawnProc );
		}
		$this->spawnProc = false;
		wfRestoreWarnings();
	}

	/**
	 * Send one request to the subprocess and read its reply.
	 *
	 * The request is the id on a line of its own. The reply is read as a
	 * line holding a byte count followed by exactly that many bytes.
	 *
	 * @param $id Text id
	 * @return The text with carriage returns removed and normalized, or
	 *         false if the exchange failed at any point
	 */
	private function getTextSpawnedOnce( $id ) {
		global $wgContLang;

		// No usable pipes (e.g. the spawn itself failed): report failure
		// so that the caller respawns.
		if( !$this->spawnWrite || !$this->spawnRead ) {
			return false;
		}

		$ok = fwrite( $this->spawnWrite, "$id\n" );
		if( !$ok ) {
			return false;
		}

		$ok = fflush( $this->spawnWrite );
		if( !$ok ) {
			return false;
		}

		$len = fgets( $this->spawnRead );
		if( $len === false ) {
			return false;
		}

		$nbytes = intval( $len );
		$text = "";

		// A single fread() may return less than requested, so keep
		// reading until the announced number of bytes has arrived.
		while( $nbytes > strlen( $text ) ) {
			$buffer = fread( $this->spawnRead, $nbytes - strlen( $text ) );
			if( $buffer === false ) {
				break;
			}
			if( $buffer === '' && feof( $this->spawnRead ) ) {
				// The subprocess went away mid-record. fread() signals EOF
				// with an empty string, not false, so without this check
				// the loop would never end.
				break;
			}
			$text .= $buffer;
		}

		$gotbytes = strlen( $text );
		if( $gotbytes != $nbytes ) {
			$this->progress( "Expected $nbytes bytes from database subprocess, got $gotbytes ");
			return false;
		}

		// Same clean-up as the direct database path
		$stripped = str_replace( "\r", "", $text );
		$normalized = $wgContLang->normalize( $stripped );
		return $normalized;
	}

	/**
	 * XML parser callback for an opening tag.
	 *
	 * Flushes buffered output to the egress filter at <revision> and at
	 * the first <page>, and replaces a <text id="..."> stub with a <text>
	 * element holding the fetched text.
	 */
	public function startElement( $parser, $name, $attribs ) {
		$this->clearOpenElement( null );
		$this->lastName = $name;

		if( $name === 'revision' ) {
			$this->state = $name;
			// Everything buffered since <page> is the page header
			$this->egress->writeOpenPage( null, $this->buffer );
			$this->buffer = "";
		} elseif( $name === 'page' ) {
			$this->state = $name;
			if( $this->atStart ) {
				// Everything before the first <page> is the stream header
				$this->egress->writeOpenStream( $this->buffer );
				$this->buffer = "";
				$this->atStart = false;
			}
		}

		if( $name === "text" && isset( $attribs['id'] ) ) {
			$text = $this->getText( $attribs['id'] );
			// The stub's own attributes are dropped in favour of xml:space
			$this->openElement = array( $name, array( 'xml:space' => 'preserve' ) );
			if( strlen( $text ) > 0 ) {
				$this->characterData( $parser, $text );
			}
		} else {
			$this->openElement = array( $name, $attribs );
		}
	}

	/**
	 * XML parser callback for a closing tag.
	 *
	 * Hands the buffered output to the egress filter at the end of each
	 * <revision>, <page> and <mediawiki> element.
	 */
	public function endElement( $parser, $name ) {
		if( $this->openElement ) {
			// The element had no content at all
			$this->clearOpenElement( "" );
		} else {
			$this->buffer .= "</$name>";
		}

		if( $name === 'revision' ) {
			$this->egress->writeRevision( null, $this->buffer );
			$this->buffer = "";
			$this->thisRev = "";
		} elseif( $name === 'page' ) {
			$this->egress->writeClosePage( $this->buffer );
			$this->buffer = "";
			$this->thisPage = "";
		} elseif( $name === 'mediawiki' ) {
			$this->egress->writeCloseStream( $this->buffer );
			$this->buffer = "";
		}
	}

	/**
	 * XML parser callback for character data; also called directly by
	 * startElement() to insert fetched text.
	 *
	 * Data inside an <id> element is additionally accumulated as the
	 * current page or revision id, depending on which of the two was
	 * opened last. The data is appended to the buffer HTML-escaped.
	 */
	public function characterData( $parser, $data ) {
		$this->clearOpenElement( null );
		if( $this->lastName === "id" ) {
			// The parser may deliver one text node in several pieces,
			// hence appending rather than assigning.
			if( $this->state === "revision" ) {
				$this->thisRev .= $data;
			} elseif( $this->state === "page" ) {
				$this->thisPage .= $data;
			}
		}
		$this->buffer .= htmlspecialchars( $data );
	}

	/**
	 * Write out the pending opening tag, if there is one.
	 *
	 * @param $style Passed to Xml::element() as its third argument; callers
	 *               use null when content follows and "" for an element
	 *               without content (presumably an open tag versus an
	 *               empty element -- confirm against Xml::element())
	 */
	public function clearOpenElement( $style ) {
		if( $this->openElement ) {
			$this->buffer .= Xml::element( $this->openElement[0], $this->openElement[1], $style );
			$this->openElement = false;
		}
	}
}
00429
00430
$dumper = new TextPassDumper( $argv );

// Print the usage text when --help is given and run the text pass in every
// other case. The condition used to be a hard-coded true, which left the
// usage text below unreachable.
$showHelp = is_array( $argv ) && in_array( '--help', $argv, true );

if( !$showHelp ) {
	$dumper->dump();
} else {
	$dumper->progress( <<<ENDS
This script postprocesses XML dumps from dumpBackup.php to add
page text which was stubbed out (using --stub).

XML input is accepted on stdin.
XML output is sent to stdout; progress reports are sent to stderr.

Usage: php dumpTextPass.php [<options>]
Options:
--stub=<type>:<file> To load a compressed stub dump instead of stdin
--prefetch=<type>:<file> Use a prior dump file as a text source, to save
pressure on the database.
(Requires PHP 5.0+ and the XMLReader PECL extension)
--quiet Don't dump status reports to stderr.
--report=n Report position and speed after every n pages processed.
(Default: 100)
--server=h Force reading from MySQL server h
--current Base ETA on number of pages in database instead of all revisions
--spawn Spawn a subprocess for loading text records
ENDS
	);
}
00458
00459