00001 <?php
00002
// Bootstrap for the recompressTracked maintenance script.
//
// $optionsWithArgs has to be assigned before commandLine.inc is pulled in.
// NOTE(review): presumably commandLine.inc reads it to decide which options
// consume a value, and is what defines $args / $options -- confirm there.
$optionsWithArgs = RecompressTracked::getOptionsWithArgs();
require( dirname( __FILE__ ) . '/../commandLine.inc' );

// At least one destination cluster is mandatory; otherwise print usage and fail.
if ( count( $args ) === 0 ) {
	echo "Usage: php recompressTracked.php [options] <cluster> [... <cluster>...]
Moves blobs indexed by trackBlobs.php to a specified list of destination clusters, and recompresses them in the process. Restartable.

Options:
--procs <procs> Set the number of child processes (default 1)
--copy-only Copy only, do not update the text table. Restart without this option to complete.
--debug-log <file> Log debugging data to the specified file
--info-log <file> Log progress messages to the specified file
--critical-log <file> Log error messages to the specified file
";
	exit( 1 );
}

// Build the job from the parsed command line and run it (as parent or child,
// depending on the options).
$job = RecompressTracked::newFromCommandLine( $args, $options );
$job->execute();
00022
/**
 * Maintenance job that moves the blobs listed in the blob_tracking table to a
 * list of destination clusters, recompressing them in the process.
 *
 * The job runs in one of two roles:
 *  - parent: scans blob_tracking and hands out work ("doPage <id>",
 *    "doOrphanList <id> ...") to child processes over their stdin pipes;
 *  - child (--child): reads those commands from STDIN and does the actual
 *    copying / moving.
 *
 * Progress is recorded in blob_tracking (bt_new_url, bt_moved), which is what
 * makes an interrupted run restartable.
 */
class RecompressTracked {
	/** List of destination cluster names, consumed round-robin by getTargetCluster() */
	public $destClusters;
	/** Row limit for each batched SELECT */
	public $batchSize = 1000;
	/** Maximum number of text IDs sent in one doOrphanList command */
	public $orphanBatchSize = 1000;
	/** Number of report() calls between progress messages */
	public $reportingInterval = 10;
	/** Number of child processes to start */
	public $numProcs = 1;
	public $useDiff, $pageBlobClass, $orphanBlobClass;
	/** Child stdin pipes, child process handles, and the ID of the last child used */
	public $slavePipes, $slaveProcs, $prevSlaveId;
	/** If true, blobs are copied but the text table is left untouched */
	public $copyOnly = false;
	/** True when running as a child process */
	public $isChild = false;
	/** Numeric ID of this child, or false in the parent */
	public $slaveId = false;
	/** If true, skip the COUNT() queries used for progress totals */
	public $noCount = false;
	/** Optional log file names */
	public $debugLog, $infoLog, $criticalLog;
	/** ExternalStoreDB instance used to write the new blobs */
	public $store;
	/** Number of report() calls since the last progress message */
	public $numBatches = 0;

	/** Command line options that take a value */
	public static $optionsWithArgs = array( 'procs', 'slave-id', 'debug-log', 'info-log', 'critical-log' );

	/** Map of command line option name => property name on this class */
	public static $cmdLineOptionMap = array(
		'no-count' => 'noCount',
		'procs' => 'numProcs',
		'copy-only' => 'copyOnly',
		'child' => 'isChild',
		'slave-id' => 'slaveId',
		'debug-log' => 'debugLog',
		'info-log' => 'infoLog',
		'critical-log' => 'criticalLog',
	);

	/**
	 * Get the list of command line options that take a value.
	 *
	 * @return array
	 */
	public static function getOptionsWithArgs() {
		return self::$optionsWithArgs;
	}

	/**
	 * Build a job from parsed command line data.
	 *
	 * @param $args array Positional arguments: the destination clusters
	 * @param $options array Parsed options, keyed by option name
	 * @return RecompressTracked
	 */
	public static function newFromCommandLine( $args, $options ) {
		$jobOptions = array( 'destClusters' => $args );
		foreach ( self::$cmdLineOptionMap as $cmdOption => $classOption ) {
			if ( isset( $options[$cmdOption] ) ) {
				$jobOptions[$classOption] = $options[$cmdOption];
			}
		}
		return new self( $jobOptions );
	}

	/**
	 * @param $options array Map of property name => value; every entry is
	 *   assigned to the property of the same name.
	 */
	public function __construct( $options ) {
		foreach ( $options as $name => $value ) {
			$this->$name = $value;
		}
		$this->store = new ExternalStoreDB;
		// Tag debug output so that parent ("M") and child lines can be told apart
		if ( !$this->isChild ) {
			$GLOBALS['wgDebugLogPrefix'] = "RCT M: ";
		} elseif ( $this->slaveId !== false ) {
			$GLOBALS['wgDebugLogPrefix'] = "RCT {$this->slaveId}: ";
		}
		// Page blobs use diff-based storage only when the xdiff extension is there
		$this->useDiff = function_exists( 'xdiff_string_bdiff' );
		$this->pageBlobClass = $this->useDiff ? 'DiffHistoryBlob' : 'ConcatenatedGzipHistoryBlob';
		$this->orphanBlobClass = 'ConcatenatedGzipHistoryBlob';
	}

	/**
	 * Send a message to wfDebug() and, if configured, to the debug log file.
	 *
	 * @param $msg string
	 */
	public function debug( $msg ) {
		wfDebug( "$msg\n" );
		if ( $this->debugLog ) {
			$this->logToFile( $msg, $this->debugLog );
		}
	}

	/**
	 * Print a progress message and, if configured, append it to the info log file.
	 *
	 * @param $msg string
	 */
	public function info( $msg ) {
		echo "$msg\n";
		if ( $this->infoLog ) {
			$this->logToFile( $msg, $this->infoLog );
		}
	}

	/**
	 * Print an error message and, if configured, append it to the critical log file.
	 *
	 * @param $msg string
	 */
	public function critical( $msg ) {
		echo "$msg\n";
		if ( $this->criticalLog ) {
			$this->logToFile( $msg, $this->criticalLog );
		}
	}

	/**
	 * Append a message to a log file, prefixed with a header made of the
	 * time, host name, process ID, child ID (if any) and wiki ID.
	 *
	 * @param $msg string
	 * @param $file string Log file name
	 */
	public function logToFile( $msg, $file ) {
		$header = '[' . date( 'd\TH:i:s' ) . '] ' . wfHostname() . ' ' . posix_getpid();
		if ( $this->slaveId !== false ) {
			$header .= "({$this->slaveId})";
		}
		$header .= ' ' . wfWikiID();
		wfErrorLog( sprintf( "%-50s %s\n", $header, $msg ), $file );
	}

	/**
	 * Wait until the DB_SLAVE connection has caught up with the current
	 * master position.
	 */
	public function syncDBs() {
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );
		$pos = $dbw->getMasterPos();
		$dbr->masterPosWait( $pos, 100000 );
	}

	/**
	 * Run the job, as parent or as child depending on $isChild.
	 */
	public function execute() {
		if ( $this->isChild ) {
			$this->executeChild();
		} else {
			$this->executeParent();
		}
	}

	/**
	 * Parent entry point: start the children, queue all pages and orphans,
	 * then tell the children to quit and wait for them.
	 */
	public function executeParent() {
		if ( !$this->checkTrackingTable() ) {
			return;
		}

		$this->syncDBs();
		$this->startSlaveProcs();
		$this->doAllPages();
		$this->doAllOrphans();
		$this->killSlaveProcs();
	}

	/**
	 * Check that the blob_tracking table exists and has at least one row.
	 *
	 * @return bool False (after logging the reason) if there is nothing to do
	 */
	public function checkTrackingTable() {
		$dbr = wfGetDB( DB_SLAVE );
		if ( !$dbr->tableExists( 'blob_tracking' ) ) {
			$this->critical( "Error: blob_tracking table does not exist" );
			return false;
		}
		$row = $dbr->selectRow( 'blob_tracking', '*', false, __METHOD__ );
		if ( !$row ) {
			$this->info( "Warning: blob_tracking table contains no rows, skipping this wiki." );
			return false;
		}
		return true;
	}

	/**
	 * Start $numProcs child processes running this same script with --child.
	 *
	 * The children get the parent's options (except slave-id, which is set
	 * per child), the current wiki ID and the destination clusters. Each
	 * child's stdin is a pipe kept in $slavePipes; stdout and stderr are
	 * shared with the parent. Exits the script if a child cannot be started.
	 */
	public function startSlaveProcs() {
		$cmd = 'php ' . wfEscapeShellArg( __FILE__ );
		foreach ( self::$cmdLineOptionMap as $cmdOption => $classOption ) {
			if ( $cmdOption == 'slave-id' ) {
				continue;
			} elseif ( in_array( $cmdOption, self::$optionsWithArgs, true ) && isset( $this->$classOption ) ) {
				$cmd .= " --$cmdOption " . wfEscapeShellArg( $this->$classOption );
			} elseif ( $this->$classOption ) {
				$cmd .= " --$cmdOption";
			}
		}
		$cmd .= ' --child' .
			' --wiki ' . wfEscapeShellArg( wfWikiID() ) .
			' ' . call_user_func_array( 'wfEscapeShellArg', $this->destClusters );

		$this->slavePipes = $this->slaveProcs = array();
		for ( $i = 0; $i < $this->numProcs; $i++ ) {
			$pipes = false;
			$spec = array(
				array( 'pipe', 'r' ),
				array( 'file', 'php://stdout', 'w' ),
				array( 'file', 'php://stderr', 'w' )
			);
			wfSuppressWarnings();
			$proc = proc_open( "$cmd --slave-id $i", $spec, $pipes );
			wfRestoreWarnings();
			if ( !$proc ) {
				$this->critical( "Error opening slave process: $cmd" );
				exit( 1 );
			}
			$this->slaveProcs[$i] = $proc;
			$this->slavePipes[$i] = $pipes[0];
		}
		// dispatch() starts its round-robin search at prevSlaveId + 1, i.e. child 0
		$this->prevSlaveId = -1;
	}

	/**
	 * Send "quit" to every child, then wait for each of them to exit,
	 * reporting any non-zero exit status.
	 */
	public function killSlaveProcs() {
		$this->info( "Waiting for slave processes to finish..." );
		for ( $i = 0; $i < $this->numProcs; $i++ ) {
			$this->dispatchToSlave( $i, 'quit' );
		}
		for ( $i = 0; $i < $this->numProcs; $i++ ) {
			$status = proc_close( $this->slaveProcs[$i] );
			if ( $status ) {
				$this->critical( "Warning: child #$i exited with status $status" );
			}
		}
		$this->info( "Done." );
	}

	/**
	 * Send a command to the next child whose pipe is ready for writing.
	 *
	 * All arguments are joined with spaces to form the command line. Waits
	 * (with a 3600 second stream_select() timeout) for a writable pipe and
	 * exits the script if none becomes available. Children are tried in
	 * round-robin order, starting after the one used last.
	 */
	public function dispatch( /*...*/ ) {
		$args = func_get_args();
		$pipes = $this->slavePipes;
		// stream_select() takes its arrays by reference, so real variables are
		// required here; only the write set is of interest.
		$read = array();
		$except = array();
		$numPipes = stream_select( $read, $pipes, $except, 3600 );
		if ( !$numPipes ) {
			$this->critical( "Error waiting to write to slaves. Aborting" );
			exit( 1 );
		}
		// $pipes now holds only the writable pipes. This relies on
		// stream_select() keeping the original array keys (the child IDs).
		for ( $i = 0; $i < $this->numProcs; $i++ ) {
			$slaveId = ( $i + $this->prevSlaveId + 1 ) % $this->numProcs;
			if ( isset( $pipes[$slaveId] ) ) {
				$this->prevSlaveId = $slaveId;
				$this->dispatchToSlave( $slaveId, $args );
				return;
			}
		}
		$this->critical( "Unreachable" );
		exit( 1 );
	}

	/**
	 * Write one command line to the stdin pipe of a specific child.
	 *
	 * @param $slaveId int Index into $slavePipes
	 * @param $args array|string Command words; joined with single spaces
	 */
	public function dispatchToSlave( $slaveId, $args ) {
		$args = (array)$args;
		$cmd = implode( ' ', $args );
		fwrite( $this->slavePipes[$slaveId], "$cmd\n" );
	}

	/**
	 * Queue a doPage command for every distinct bt_page that still has
	 * unmoved rows in blob_tracking.
	 */
	public function doAllPages() {
		$dbr = wfGetDB( DB_SLAVE );
		$i = 0;
		$startId = 0;
		if ( $this->noCount ) {
			$numPages = '[unknown]';
		} else {
			$numPages = $dbr->selectField( 'blob_tracking',
				'COUNT(DISTINCT bt_page)',
				# A condition is required so that this query uses the index
				array( 'bt_moved' => 0 ),
				__METHOD__
			);
		}
		if ( $this->copyOnly ) {
			$this->info( "Copying pages..." );
		} else {
			$this->info( "Moving pages..." );
		}
		// Walk the table in batches, ordered by bt_page, resuming after the
		// last page ID of the previous batch.
		while ( true ) {
			$res = $dbr->select( 'blob_tracking',
				array( 'bt_page' ),
				array(
					'bt_moved' => 0,
					'bt_page > ' . $dbr->addQuotes( $startId )
				),
				__METHOD__,
				array(
					'DISTINCT',
					'ORDER BY' => 'bt_page',
					'LIMIT' => $this->batchSize,
				)
			);
			if ( !$res->numRows() ) {
				break;
			}
			foreach ( $res as $row ) {
				$this->dispatch( 'doPage', $row->bt_page );
				$i++;
			}
			$startId = $row->bt_page;
			$this->report( 'pages', $i, $numPages );
		}
		$this->report( 'pages', $i, $numPages );
		if ( $this->copyOnly ) {
			$this->info( "All page copies queued." );
		} else {
			$this->info( "All page moves queued." );
		}
	}

	/**
	 * Count one finished batch and, every $reportingInterval batches or when
	 * $current equals $end, print progress and wait for slave lag to drop.
	 *
	 * @param $label string Name of the item type, used in the message
	 * @param $current int Number of items queued so far
	 * @param $end int|string Total number of items, or '[unknown]'
	 */
	public function report( $label, $current, $end ) {
		$this->numBatches++;
		if ( $current == $end || $this->numBatches >= $this->reportingInterval ) {
			$this->numBatches = 0;
			$this->info( "$label: $current / $end" );
			$this->waitForSlaves();
		}
	}

	/**
	 * Queue doOrphanList commands for every unmoved blob_tracking row with
	 * bt_page = 0, in groups of at most $orphanBatchSize text IDs.
	 */
	public function doAllOrphans() {
		$dbr = wfGetDB( DB_SLAVE );
		$startId = 0;
		$i = 0;
		if ( $this->noCount ) {
			$numOrphans = '[unknown]';
		} else {
			$numOrphans = $dbr->selectField( 'blob_tracking',
				'COUNT(DISTINCT bt_text_id)',
				array( 'bt_moved' => 0, 'bt_page' => 0 ),
				__METHOD__ );
			if ( !$numOrphans ) {
				return;
			}
		}
		if ( $this->copyOnly ) {
			$this->info( "Copying orphans..." );
		} else {
			$this->info( "Moving orphans..." );
		}

		while ( true ) {
			$res = $dbr->select( 'blob_tracking',
				array( 'bt_text_id' ),
				array(
					'bt_moved' => 0,
					'bt_page' => 0,
					'bt_text_id > ' . $dbr->addQuotes( $startId )
				),
				__METHOD__,
				array(
					'DISTINCT',
					'ORDER BY' => 'bt_text_id',
					'LIMIT' => $this->batchSize
				)
			);
			if ( !$res->numRows() ) {
				break;
			}
			$ids = array();
			foreach ( $res as $row ) {
				$ids[] = $row->bt_text_id;
				$i++;
			}

			// Send the IDs in command-sized groups: "doOrphanList <id> <id> ..."
			foreach ( array_chunk( $ids, $this->orphanBatchSize ) as $chunk ) {
				array_unshift( $chunk, 'doOrphanList' );
				call_user_func_array( array( $this, 'dispatch' ), $chunk );
			}

			$startId = $row->bt_text_id;
			$this->report( 'orphans', $i, $numOrphans );
		}
		$this->report( 'orphans', $i, $numOrphans );
		$this->info( "All orphans queued." );
	}

	/**
	 * Child entry point: read commands from STDIN, one per line, until EOF
	 * or a "quit" command. Unknown commands are ignored.
	 */
	public function executeChild() {
		$this->debug( 'starting' );
		$this->syncDBs();

		while ( !feof( STDIN ) ) {
			$line = rtrim( fgets( STDIN ) );
			if ( $line == '' ) {
				continue;
			}
			$this->debug( $line );
			$args = explode( ' ', $line );
			$cmd = array_shift( $args );
			switch ( $cmd ) {
			case 'doPage':
				$this->doPage( intval( $args[0] ) );
				break;
			case 'doOrphanList':
				$this->doOrphanList( array_map( 'intval', $args ) );
				break;
			case 'quit':
				return;
			}
			$this->waitForSlaves();
		}
	}

	/**
	 * Copy (and, unless --copy-only, move) all tracked, not yet processed
	 * text rows of one page into new blobs.
	 *
	 * @param $pageId int
	 */
	public function doPage( $pageId ) {
		$title = Title::newFromId( $pageId );
		if ( $title ) {
			$titleText = $title->getPrefixedText();
		} else {
			$titleText = '[deleted]';
		}
		$dbr = wfGetDB( DB_SLAVE );

		// Finish any incomplete transactions: rows that already have a new URL
		// but have not been marked as moved.
		if ( !$this->copyOnly ) {
			$this->finishIncompleteMoves( array( 'bt_page' => $pageId ) );
			$this->syncDBs();
		}

		$startId = 0;
		$trx = new CgzCopyTransaction( $this, $this->pageBlobClass );

		while ( true ) {
			$res = $dbr->select(
				array( 'blob_tracking', 'text' ),
				'*',
				array(
					'bt_page' => $pageId,
					'bt_text_id > ' . $dbr->addQuotes( $startId ),
					'bt_moved' => 0,
					'bt_new_url IS NULL',
					'bt_text_id=old_id',
				),
				__METHOD__,
				array(
					'ORDER BY' => 'bt_text_id',
					'LIMIT' => $this->batchSize
				)
			);
			if ( !$res->numRows() ) {
				break;
			}

			$lastTextId = 0;
			foreach ( $res as $row ) {
				if ( $lastTextId == $row->bt_text_id ) {
					// Same text ID as the previous row (rows are ordered by
					// bt_text_id): already handled, skip the duplicate.
					continue;
				}
				$lastTextId = $row->bt_text_id;

				// Load the text
				$text = Revision::getRevisionText( $row );
				if ( $text === false ) {
					$this->critical( "Error loading {$row->bt_rev_id}/{$row->bt_text_id}" );
					continue;
				}

				// addItem() returning false means the current blob should be
				// written out and a new one started.
				if ( !$trx->addItem( $text, $row->bt_text_id ) ) {
					$this->debug( "$titleText: committing blob with " . $trx->getSize() . " items" );
					$trx->commit();
					$trx = new CgzCopyTransaction( $this, $this->pageBlobClass );
					$this->waitForSlaves();
				}
			}
			$startId = $row->bt_text_id;
		}

		$this->debug( "$titleText: committing blob with " . $trx->getSize() . " items" );
		$trx->commit();
	}

	/**
	 * Point a text row at its new URL and mark the matching blob_tracking
	 * rows as moved, both in a single master transaction.
	 *
	 * Must not be called in --copy-only mode; doing so exits the script.
	 *
	 * @param $textId int old_id / bt_text_id of the row
	 * @param $url string New value for old_text
	 */
	public function moveTextRow( $textId, $url ) {
		if ( $this->copyOnly ) {
			$this->critical( "Internal error: can't call moveTextRow() in --copy-only mode" );
			exit( 1 );
		}
		$dbw = wfGetDB( DB_MASTER );
		$dbw->begin();
		$dbw->update( 'text',
			array(
				'old_text' => $url,
				'old_flags' => 'external,utf-8',
			),
			array(
				'old_id' => $textId
			),
			__METHOD__
		);
		$dbw->update( 'blob_tracking',
			array( 'bt_moved' => 1 ),
			array( 'bt_text_id' => $textId ),
			__METHOD__
		);
		$dbw->commit();
	}

	/**
	 * Complete moves that were interrupted after the new blob was written:
	 * for every matching blob_tracking row that has a bt_new_url but is not
	 * yet marked as moved, call moveTextRow().
	 *
	 * @param $conds array Extra conditions restricting the blob_tracking rows
	 */
	public function finishIncompleteMoves( $conds ) {
		$dbr = wfGetDB( DB_SLAVE );

		$startId = 0;
		$conds = array_merge( $conds, array(
			'bt_moved' => 0,
			'bt_new_url IS NOT NULL'
		) );
		while ( true ) {
			$res = $dbr->select( 'blob_tracking',
				'*',
				array_merge( $conds, array( 'bt_text_id > ' . $dbr->addQuotes( $startId ) ) ),
				__METHOD__,
				array(
					'ORDER BY' => 'bt_text_id',
					'LIMIT' => $this->batchSize,
				)
			);
			if ( !$res->numRows() ) {
				break;
			}
			$this->debug( 'Incomplete: ' . $res->numRows() . ' rows' );
			foreach ( $res as $row ) {
				$this->moveTextRow( $row->bt_text_id, $row->bt_new_url );
				// Check the slave lag on roughly every tenth row
				if ( $row->bt_text_id % 10 == 0 ) {
					$this->waitForSlaves();
				}
			}
			$startId = $row->bt_text_id;
		}
	}

	/**
	 * Get the next destination cluster, cycling through $destClusters.
	 *
	 * The array's internal pointer is advanced first, so the very first call
	 * returns the second cluster (or the only one, if there is just one).
	 *
	 * @return string
	 */
	public function getTargetCluster() {
		$cluster = next( $this->destClusters );
		if ( $cluster === false ) {
			$cluster = reset( $this->destClusters );
		}
		return $cluster;
	}

	/**
	 * Get a master connection to the given external cluster.
	 *
	 * @param $cluster string
	 * @return object Connection returned by the cluster's load balancer
	 */
	public function getExtDB( $cluster ) {
		$lb = wfGetLBFactory()->getExternalLB( $cluster );
		return $lb->getConnection( DB_MASTER );
	}

	/**
	 * Copy (and, unless --copy-only, move) a list of orphan text rows into
	 * new blobs.
	 *
	 * @param $textIds array List of old_id values
	 */
	public function doOrphanList( $textIds ) {
		// Finish incomplete moves
		if ( !$this->copyOnly ) {
			$this->finishIncompleteMoves( array( 'bt_text_id' => $textIds ) );
			$this->syncDBs();
		}

		$trx = new CgzCopyTransaction( $this, $this->orphanBlobClass );

		$res = wfGetDB( DB_SLAVE )->select(
			array( 'text', 'blob_tracking' ),
			array( 'old_id', 'old_text', 'old_flags' ),
			array(
				'old_id' => $textIds,
				'bt_text_id=old_id',
				'bt_moved' => 0,
			),
			__METHOD__,
			array( 'DISTINCT' )
		);

		foreach ( $res as $row ) {
			$text = Revision::getRevisionText( $row );
			if ( $text === false ) {
				$this->critical( "Error: cannot load revision text for old_id={$row->old_id}" );
				continue;
			}

			// addItem() returning false means the current blob should be
			// written out and a new one started.
			if ( !$trx->addItem( $text, $row->old_id ) ) {
				$this->debug( "[orphan]: committing blob with " . $trx->getSize() . " rows" );
				$trx->commit();
				$trx = new CgzCopyTransaction( $this, $this->orphanBlobClass );
				$this->waitForSlaves();
			}
		}
		$this->debug( "[orphan]: committing blob with " . $trx->getSize() . " rows" );
		$trx->commit();
	}

	/**
	 * Block, polling every 5 seconds, until the maximum lag reported by the
	 * load balancer is below 2.
	 */
	public function waitForSlaves() {
		$lb = wfGetLB();
		while ( true ) {
			// getMaxLag() returns a (host, lag) pair; only the lag is needed
			list( , $maxLag ) = $lb->getMaxLag();
			if ( $maxLag < 2 ) {
				break;
			}
			sleep( 5 );
		}
	}
}
00642
/**
 * Collects a set of texts into one blob object and writes that blob to a
 * destination cluster, recording the resulting URLs in blob_tracking.
 */
class CgzCopyTransaction {
	/** RecompressTracked job that owns this transaction (used for logging, store and cluster selection) */
	public $parent;
	/** Name of the blob class to instantiate */
	public $blobClass;
	/** The blob object, or false until the first item is added */
	public $cgz;
	/** Map of text ID => hash returned by the blob's addItem() */
	public $referrers;
	/** Map of text ID => text, kept so the blob can be rebuilt by recompress() */
	public $texts;

	/**
	 * Create a transaction that has no items yet.
	 *
	 * @param $parent RecompressTracked
	 * @param $blobClass string
	 */
	public function __construct( $parent, $blobClass ) {
		$this->blobClass = $blobClass;
		$this->cgz = false;
		$this->texts = array();
		$this->referrers = array();
		$this->parent = $parent;
	}

	/**
	 * Add a text to the blob, creating the blob object on first use.
	 *
	 * @param $text string
	 * @param $textId int
	 * @return bool The blob's isHappy() result. Callers in this file commit
	 *   the transaction and start a new one when this is false.
	 */
	public function addItem( $text, $textId ) {
		if ( !$this->cgz ) {
			$class = $this->blobClass;
			$this->cgz = new $class;
		}
		$hash = $this->cgz->addItem( $text );
		$this->referrers[$textId] = $hash;
		$this->texts[$textId] = $text;
		return $this->cgz->isHappy();
	}

	/**
	 * Get the number of texts currently held by the transaction.
	 *
	 * @return int
	 */
	public function getSize() {
		return count( $this->texts );
	}

	/**
	 * Rebuild the blob and the referrer map from the texts that are still
	 * in $texts. Used after some texts have been dropped.
	 */
	public function recompress() {
		$class = $this->blobClass;
		$this->cgz = new $class;
		$this->referrers = array();
		foreach ( $this->texts as $textId => $text ) {
			$hash = $this->cgz->addItem( $text );
			$this->referrers[$textId] = $hash;
		}
	}

	/**
	 * Write the blob to the next target cluster and store the new URLs in
	 * blob_tracking. Unless the parent is in copy-only mode, the text rows
	 * are then moved to the new URLs as well.
	 *
	 * Does nothing if no texts were added.
	 */
	public function commit() {
		$originalCount = count( $this->texts );
		if ( !$originalCount ) {
			return;
		}

		// Lock the blob_tracking rows and check whether another process has
		// already moved any of them. Such rows are dropped from this
		// transaction and the blob is rebuilt without them.
		$dbw = wfGetDB( DB_MASTER );
		$dbw->begin();
		$res = $dbw->select( 'blob_tracking',
			array( 'bt_text_id', 'bt_moved' ),
			array( 'bt_text_id' => array_keys( $this->referrers ) ),
			__METHOD__, array( 'FOR UPDATE' ) );
		$dirty = false;
		foreach ( $res as $row ) {
			if ( $row->bt_moved ) {
				# This row has already been moved, remove it
				$this->parent->debug( "TRX: conflict detected in old_id={$row->bt_text_id}" );
				unset( $this->texts[$row->bt_text_id] );
				$dirty = true;
			}
		}

		// Recompress the blob if necessary
		if ( $dirty ) {
			if ( !count( $this->texts ) ) {
				// All rows have been moved already
				if ( $originalCount > 1 ) {
					// Losing several rows at once is suspicious, make noise.
					// The logging methods live on the parent job.
					$this->parent->critical( "Warning: concurrent operation detected, are there two conflicting " .
						"processes running, doing the same job?" );
				}
				// Nothing was written; end the transaction opened above so
				// the FOR UPDATE locks are not kept until the next begin().
				$dbw->rollback();
				return;
			}
			$this->recompress();
		}

		// Insert the data into the destination cluster
		$targetCluster = $this->parent->getTargetCluster();
		$store = $this->parent->store;
		$targetDB = $store->getMaster( $targetCluster );
		$targetDB->clearFlag( DBO_TRX );
		$targetDB->begin();
		$baseUrl = $store->store( $targetCluster, serialize( $this->cgz ) );

		// Write the new URLs to the blob_tracking table
		foreach ( $this->referrers as $textId => $hash ) {
			$url = $baseUrl . '/' . $hash;
			$dbw->update( 'blob_tracking',
				array( 'bt_new_url' => $url ),
				array(
					'bt_text_id' => $textId,
					'bt_moved' => 0, # Check for concurrent conflicting update
				),
				__METHOD__
			);
		}

		// Commit the blob first, then the tracking rows that point at it
		$targetDB->commit();
		$dbw->commit();

		// Finally update the text table, one row (and one transaction) at a time
		if ( !$this->parent->copyOnly ) {
			foreach ( $this->referrers as $textId => $hash ) {
				$url = $baseUrl . '/' . $hash;
				$this->parent->moveTextRow( $textId, $url );
			}
		}
	}
}
00777