00001 <?php
00021 require_once( dirname(__FILE__) . '/Maintenance.php' );
00022
/**
 * Maintenance script that refreshes the redirect table and the link tables
 * from current page content, then deletes link-table rows whose source page
 * no longer exists.
 */
class RefreshLinks extends Maintenance {
	/**
	 * Register the command-line options and the optional "start" argument,
	 * and set the default batch size to 100.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = "Refresh link tables";
		$this->addOption( 'dfn-only', 'Delete links from nonexistent articles only' );
		$this->addOption( 'new-only', 'Only affect articles with just a single edit' );
		$this->addOption( 'redirects-only', 'Only fix redirects, not all links' );
		$this->addOption( 'old-redirects-only', 'Only fix redirects with no redirect table entry' );
		$this->addOption( 'm', 'Maximum replication lag', false, true );
		$this->addOption( 'e', 'Last page id to refresh', false, true );
		$this->addArg( 'start', 'Page_id to start from, default 1', false );
		$this->setBatchSize( 100 );
	}

	/**
	 * Entry point: run the refresh pass (unless --dfn-only was given) and
	 * then always run the cleanup of links from nonexistent pages.
	 */
	public function execute() {
		// Read the lag limit before branching: the cleanup pass below needs it
		// even when --dfn-only skips the refresh pass.
		$max = $this->getOption( 'm', false );

		if ( !$this->hasOption( 'dfn-only' ) ) {
			$start = $this->getArg( 0, 1 );
			$new = $this->getOption( 'new-only', false );
			$end = $this->getOption( 'e', 0 );
			$redir = $this->getOption( 'redirects-only', false );
			$oldRedir = $this->getOption( 'old-redirects-only', false );
			$this->doRefreshLinks( $start, $new, $max, $end, $redir, $oldRedir );
		}
		$this->deleteLinksFromNonexistent( $max, $this->mBatchSize );
	}

	/**
	 * Refresh redirect and/or link data for a range of pages.
	 *
	 * Three modes, checked in this order:
	 *  - $oldRedirectsOnly: fix pages flagged as redirects that have no row
	 *    in the redirect table;
	 *  - $newOnly: fix pages with page_is_new = 1 (note: $end is not applied
	 *    in this mode);
	 *  - otherwise: walk every page ID from $start to $end, fixing redirects
	 *    and, unless $redirectsOnly is set, links as well.
	 *
	 * @param $start Integer-like: first page ID to process
	 * @param $newOnly Boolean: only process pages marked as new
	 * @param $maxLag Mixed: value handed to wfWaitForSlaves() every
	 *   100 pages (the 'm' option, "Maximum replication lag")
	 * @param $end Integer-like: last page ID to process; 0 means "no upper
	 *   bound" (the default mode then uses the highest known page ID)
	 * @param $redirectsOnly Boolean: only fix redirects, not links
	 * @param $oldRedirectsOnly Boolean: only fix redirects lacking a
	 *   redirect table entry
	 */
	private function doRefreshLinks( $start, $newOnly = false, $maxLag = false,
		$end = 0, $redirectsOnly = false, $oldRedirectsOnly = false ) {
		global $wgUser, $wgParser, $wgUseTidy;

		$reportingInterval = 100;
		$dbr = wfGetDB( DB_SLAVE );

		// Both bounds are interpolated into SQL conditions below, so force
		// them to integers first.
		$start = intval( $start );
		$end = intval( $end );

		# Don't generate TeX PNGs (lack of a sensible current directory causes errors anyway)
		$wgUser->setOption( 'math', MW_MATH_SOURCE );

		# Don't generate extension images (e.g. Timeline)
		if ( method_exists( $wgParser, "clearTagHooks" ) ) {
			$wgParser->clearTagHooks();
		}

		# Don't use HTML tidy
		$wgUseTidy = false;

		$what = $redirectsOnly ? "redirects" : "links";

		if ( $oldRedirectsOnly ) {
			// Redirect pages that have no matching redirect-table row.
			// Built with select() rather than a hand-written query so table
			// names go through the database layer.
			$conds = array(
				'page_is_redirect' => 1,
				'rd_from' => null,
				$end == 0
					? "page_id >= $start"
					: "page_id BETWEEN $start AND $end",
			);
			$res = $dbr->select(
				array( 'page', 'redirect' ),
				'page_id',
				$conds,
				__METHOD__,
				array(),
				array( 'redirect' => array( 'LEFT JOIN', 'page_id=rd_from' ) )
			);
			$num = $dbr->numRows( $res );
			$this->output( "Refreshing $num old redirects from $start...\n" );

			$i = 0;
			foreach ( $res as $row ) {
				if ( !( ++$i % $reportingInterval ) ) {
					$this->output( "$i\n" );
					wfWaitForSlaves( $maxLag );
				}
				$this->fixRedirect( $row->page_id );
			}
		} elseif ( $newOnly ) {
			$this->output( "Refreshing $what from " );
			$res = $dbr->select( 'page',
				array( 'page_id' ),
				array(
					'page_is_new' => 1,
					"page_id >= $start" ),
				__METHOD__
			);
			$num = $dbr->numRows( $res );
			$this->output( "$num new articles...\n" );

			$i = 0;
			foreach ( $res as $row ) {
				if ( !( ++$i % $reportingInterval ) ) {
					$this->output( "$i\n" );
					wfWaitForSlaves( $maxLag );
				}
				if ( $redirectsOnly ) {
					$this->fixRedirect( $row->page_id );
				} else {
					$this->fixLinksFromArticle( $row->page_id );
				}
			}
		} else {
			if ( !$end ) {
				// No explicit end: go up to the highest ID present in either
				// table, so stale redirect rows are visited too.
				$maxPage = $dbr->selectField( 'page', 'max(page_id)', false );
				$maxRD = $dbr->selectField( 'redirect', 'max(rd_from)', false );
				$end = max( $maxPage, $maxRD );
			}
			$this->output( "Refreshing redirects table.\n" );
			$this->output( "Starting from page_id $start of $end.\n" );

			for ( $id = $start; $id <= $end; $id++ ) {
				if ( !( $id % $reportingInterval ) ) {
					$this->output( "$id\n" );
					wfWaitForSlaves( $maxLag );
				}
				$this->fixRedirect( $id );
			}

			if ( !$redirectsOnly ) {
				$this->output( "Refreshing links table.\n" );
				$this->output( "Starting from page_id $start of $end.\n" );

				for ( $id = $start; $id <= $end; $id++ ) {
					if ( !( $id % $reportingInterval ) ) {
						$this->output( "$id\n" );
						wfWaitForSlaves( $maxLag );
					}
					$this->fixLinksFromArticle( $id );
				}
			}
		}
	}

	/**
	 * Bring the redirect table entry for one page ID up to date.
	 *
	 * The row is deleted when no title can be loaded for the ID or when
	 * followRedirect() does not yield an object; otherwise the row is
	 * written through Article::updateRedirectOn().
	 *
	 * Sets the globals $wgTitle and $wgArticle as a side effect.
	 *
	 * @param $id Integer: page ID to check
	 */
	private function fixRedirect( $id ) {
		global $wgTitle, $wgArticle;

		$wgTitle = Title::newFromID( $id );
		$dbw = wfGetDB( DB_MASTER );

		if ( is_null( $wgTitle ) ) {
			// No title for this ID: drop any leftover redirect row.
			$dbw->delete( 'redirect', array( 'rd_from' => $id ),
				__METHOD__ );
			return;
		}
		$wgArticle = new Article( $wgTitle );

		$rt = $wgArticle->followRedirect();

		if ( !is_object( $rt ) ) {
			// followRedirect() gave no target object, so there must be no
			// redirect row for this page.
			$dbw->delete( 'redirect', array( 'rd_from' => $id ),
				__METHOD__ );
		} else {
			$wgArticle->updateRedirectOn( $dbw, $rt );
		}
	}

	/**
	 * Re-parse the current revision of one page and run a LinksUpdate on
	 * the result, inside a single master transaction.
	 *
	 * Does nothing (beyond clearing the link cache) when no title or no
	 * revision can be loaded for the ID. Sets the global $wgTitle as a
	 * side effect.
	 *
	 * @param $id Integer: page ID to refresh
	 */
	private function fixLinksFromArticle( $id ) {
		global $wgTitle, $wgParser;

		$wgTitle = Title::newFromID( $id );
		$dbw = wfGetDB( DB_MASTER );

		$linkCache = LinkCache::singleton();
		$linkCache->clear();

		if ( is_null( $wgTitle ) ) {
			return;
		}

		// Load the revision before opening the transaction, so the early
		// return below cannot leave a transaction open on the master.
		$revision = Revision::newFromTitle( $wgTitle );
		if ( !$revision ) {
			return;
		}

		$dbw->begin();

		$options = new ParserOptions;
		$parserOutput = $wgParser->parse( $revision->getText(), $wgTitle, $options, true, true, $revision->getId() );
		$update = new LinksUpdate( $wgTitle, $parserOutput, false );
		$update->doUpdate();
		$dbw->commit();
	}

	/**
	 * Delete rows from the link tables (pagelinks, imagelinks,
	 * categorylinks, templatelinks, externallinks) whose "from" page ID has
	 * no matching row in the page table.
	 *
	 * Offending IDs are read through a dedicated connection with result
	 * buffering switched off, and deleted on the master in batches.
	 *
	 * @param $maxLag Mixed: value handed to wfWaitForSlaves() once before
	 *   the work starts
	 * @param $batchSize Integer: number of page IDs deleted per query
	 */
	private function deleteLinksFromNonexistent( $maxLag = 0, $batchSize = 100 ) {
		wfWaitForSlaves( $maxLag );

		$dbw = wfGetDB( DB_MASTER );

		// Separate load balancer, closed again at the end of this method.
		// NOTE(review): presumably so the unbuffered read does not interfere
		// with the shared connections - confirm.
		$lb = wfGetLBFactory()->newMainLB();
		$dbr = $lb->getConnection( DB_SLAVE );
		$dbr->bufferResults( false );

		// Link table => column holding the ID of the linking page.
		$linksTables = array(
			'pagelinks' => 'pl_from',
			'imagelinks' => 'il_from',
			'categorylinks' => 'cl_from',
			'templatelinks' => 'tl_from',
			'externallinks' => 'el_from',
		);

		foreach ( $linksTables as $table => $field ) {
			$this->output( "Retrieving illegal entries from $table... " );

			// Distinct "from" IDs with no page row (LEFT JOIN + IS NULL).
			$results = $dbr->select( array( $table, 'page' ),
				$field,
				array( 'page_id' => null ),
				__METHOD__,
				'DISTINCT',
				array( 'page' => array( 'LEFT JOIN', "$field=page_id" ) )
			);

			$counter = 0;
			$list = array();
			$this->output( "0.." );

			foreach ( $results as $row ) {
				$counter++;
				$list[] = $row->$field;
				if ( ( $counter % $batchSize ) == 0 ) {
					// Fixed value here, independent of $maxLag.
					wfWaitForSlaves( 5 );
					$dbw->delete( $table, array( $field => $list ), __METHOD__ );

					$this->output( $counter . ".." );
					$list = array();
				}
			}
			$this->output( $counter );
			// Flush the final, partially filled batch.
			if ( count( $list ) > 0 ) {
				$dbw->delete( $table, array( $field => $list ), __METHOD__ );
			}
			$this->output( "\n" );
		}
		$lb->closeAll();
	}
}
00281
// Name the class defined in this file, then include the file the
// DO_MAINTENANCE constant points to.
// NOTE(review): presumably that file instantiates $maintClass and calls
// its execute() - confirm against Maintenance.php.
00282 $maintClass = 'RefreshLinks';
00283 require_once( DO_MAINTENANCE );