00001 <?php
00002
00003 require( dirname( __FILE__ ) .'/../commandLine.inc' );
00004
00005
// Entry point. $args is not defined in this file; presumably it is populated
// by the commandLine.inc include above with the positional CLI arguments.
// At least one cluster name is required.
if ( count( $args ) === 0 ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n",
		"Adds blobs from a given ES cluster to the blob_tracking table\n",
		"Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}

// Every argument is treated as a cluster name and handed to the tracker.
$tracker = new TrackBlobs( $args );
$tracker->run();

echo "All done.\n";
00016
/**
 * Rebuilds the blob_tracking table (and, when the GMP extension is available,
 * the blob_orphans table) for a set of external storage clusters.
 *
 * The work is done in three passes, see run():
 *  - text rows referenced from the revision table,
 *  - text rows that point at a cluster but have no blob_tracking entry,
 *  - blobs present on a cluster that none of the rows above point to.
 */
class TrackBlobs {
	/** @var array Cluster names passed to the constructor */
	public $clusters;

	/** @var string|null Cached SQL condition built by getTextClause() */
	public $textClause;

	/** @var bool Whether the orphan blob pass can run (requires GMP) */
	public $doBlobOrphans = false;

	/** @var array Map of cluster name => GMP bitfield of blob IDs seen so far */
	public $trackedBlobs = array();

	/** @var int Number of rows fetched per query */
	public $batchSize = 1000;

	/** @var int Number of batches between progress reports */
	public $reportingInterval = 10;

	/**
	 * @param array $clusters Names of the clusters to track
	 */
	public function __construct( $clusters ) {
		$this->clusters = $clusters;

		if ( !extension_loaded( 'gmp' ) ) {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
			return;
		}

		// One bitfield per cluster; bit N is set once blob ID N has been seen.
		$this->doBlobOrphans = true;
		foreach ( $clusters as $cluster ) {
			$this->trackedBlobs[$cluster] = gmp_init( 0 );
		}
	}

	/**
	 * Run all passes in order. The orphan blob pass is skipped without GMP.
	 */
	public function run() {
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
	}

	/**
	 * Drop any existing tracking tables and re-create them from
	 * blob_tracking.sql, which lives next to this file.
	 *
	 * Each table is checked on its own so that a half-initialised schema
	 * (only one of the two tables present) neither triggers a DROP TABLE on
	 * a missing table nor leaves a stale table behind.
	 */
	public function initTrackingTable() {
		$dbw = wfGetDB( DB_MASTER );
		foreach ( array( 'blob_tracking', 'blob_orphans' ) as $table ) {
			if ( $dbw->tableExists( $table ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ) );
			}
		}
		$dbw->sourceFile( dirname( __FILE__ ) . '/blob_tracking.sql' );
	}

	/**
	 * Build (once) and return the SQL condition that matches old_text values
	 * starting with "DB://<cluster>/" for any of the configured clusters.
	 *
	 * @return string LIKE conditions joined with OR
	 */
	public function getTextClause() {
		if ( !$this->textClause ) {
			$dbr = wfGetDB( DB_SLAVE );
			$likes = array();
			foreach ( $this->clusters as $cluster ) {
				$likes[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
			}
			$this->textClause = implode( ' OR ', $likes );
		}
		return $this->textClause;
	}

	/**
	 * Parse a pointer of the form DB://<cluster>/<id>[/<hex hash>].
	 *
	 * @param string $text Pointer text
	 * @return array|bool Array with keys 'cluster' (string), 'id' (int) and
	 *   'hash' (string, or null when the pointer has no hash part), or false
	 *   if $text does not match the expected form
	 */
	public function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}
		return array(
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			'hash' => isset( $m[3] ) ? $m[3] : null
		);
	}

	/**
	 * Validate one pointer and turn it into a blob_tracking row.
	 *
	 * As a side effect the blob is marked as seen in the per-cluster bitfield
	 * when orphan blob detection is enabled. Invalid pointers and pointers to
	 * clusters that were not requested are reported on stdout and skipped.
	 *
	 * @param string $pointer Pointer text (old_text)
	 * @param int|string $pageId Value for bt_page
	 * @param int|string $revId Value for bt_rev_id
	 * @param int|string $textId Value for bt_text_id
	 * @param string $source Description of the row used in diagnostics,
	 *   e.g. "rev_id 123"
	 * @return array|null Row for blob_tracking, or null if it was skipped
	 */
	private function makeTrackingRow( $pointer, $pageId, $revId, $textId, $source ) {
		$info = $this->interpretPointer( $pointer );
		if ( !$info ) {
			echo "Invalid DB:// URL in $source\n";
			return null;
		}
		if ( !in_array( $info['cluster'], $this->clusters ) ) {
			echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
			return null;
		}

		if ( $this->doBlobOrphans ) {
			gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
		}

		return array(
			'bt_page' => $pageId,
			'bt_rev_id' => $revId,
			'bt_text_id' => $textId,
			'bt_cluster' => $info['cluster'],
			'bt_blob_id' => $info['id'],
			'bt_cgz_hash' => $info['hash']
		);
	}

	/**
	 * Scan the revision table, joined to text, for rows whose text is stored
	 * on one of the configured clusters, and record them in blob_tracking.
	 */
	public function trackRevisions() {
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );

		$textClause = $this->getTextClause();
		$startId = 0;
		// Only used for the progress display.
		$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		while ( true ) {
			$res = $dbr->select( array( 'revision', 'text' ),
				array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
				array(
					'rev_id > ' . $dbr->addQuotes( $startId ),
					'rev_text_id=old_id',
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
				),
				__METHOD__,
				array(
					'ORDER BY' => 'rev_id',
					'LIMIT' => $this->batchSize
				)
			);
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that end up being skipped.
				$startId = $row->rev_id;
				$trackingRow = $this->makeTrackingRow(
					$row->old_text,
					$row->rev_page,
					$row->rev_id,
					$row->old_id,
					"rev_id {$row->rev_id}"
				);
				if ( $trackingRow !== null ) {
					$insertBatch[] = $trackingRow;
				}
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
			}
			$rowsInserted += count( $insertBatch );

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves( 5 );
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table for rows that point at one of the configured
	 * clusters but have no blob_tracking entry yet, and record them with
	 * bt_page and bt_rev_id set to 0.
	 */
	public function trackOrphanText() {
		# Wait until the blob_tracking table is available in the slave
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );
		$pos = $dbw->getMasterPos();
		$dbr->masterPosWait( $pos, 100000 );

		$textClause = $this->getTextClause();
		$startId = 0;
		// Only used for the progress display.
		$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
		$rowsInserted = 0;
		$batchesDone = 0;

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			$res = $dbr->select( array( 'text', 'blob_tracking' ),
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id>' . $dbr->addQuotes( $startId ),
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
					'bt_text_id IS NULL'
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize
				),
				array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
			);

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that end up being skipped.
				$startId = $row->old_id;
				$trackingRow = $this->makeTrackingRow(
					$row->old_text,
					0,
					0,
					$row->old_id,
					"old_id {$row->old_id}"
				);
				if ( $trackingRow !== null ) {
					$insertBatch[] = $trackingRow;
				}
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
			}

			$rowsInserted += count( $insertBatch );
			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves( 5 );
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * For each configured cluster, list the blob IDs that exist in the
	 * cluster's blobs table but were not seen by the earlier passes, and
	 * record them in blob_orphans.
	 *
	 * Relies on the bitfields filled in by trackRevisions() and
	 * trackOrphanText(), so it is only meaningful after those have run.
	 */
	public function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
			return;
		}

		$dbw = wfGetDB( DB_MASTER );

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = wfGetLBFactory()->getExternalLB( $cluster );
			try {
				$extDB = $lb->getConnection( DB_SLAVE );
			} catch ( DBConnectionError $e ) {
				if ( strpos( $e->error, 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}

			// Fall back to the default table name when none is configured.
			$table = $extDB->getLBInfo( 'blobs table' );
			if ( $table === null ) {
				$table = 'blobs';
			}
			if ( !$extDB->tableExists( $table ) ) {
				echo "No blobs table on cluster $cluster\n";
				continue;
			}

			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			// Only used for the progress display.
			$endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

			// Build a bitfield with one bit set per blob ID present on the cluster.
			while ( true ) {
				$res = $extDB->select( $table,
					array( 'blob_id' ),
					array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
					__METHOD__,
					array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
				);

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					gmp_setbit( $actualBlobs, intval( $row->blob_id ) );
					// Rows are ordered by blob_id, so the last one is the new cursor.
					$startId = $row->blob_id;
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Orphans are the blobs that exist but were never referenced:
			// actual AND NOT tracked.
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Walk the set bits and write them out in batches.
			$insertBatch = array();
			$id = 0;
			$numOrphans = 0;
			while ( true ) {
				$id = gmp_scan1( $orphans, $id );
				if ( $id == -1 ) {
					break;
				}
				$insertBatch[] = array(
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				);
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = array();
				}

				// Resume the scan just after the bit that was found.
				++$id;
				++$numOrphans;
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
			}
			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}