00001 <?php
/**
 * Regular expression matching the character references handled by
 * Sanitizer::normalizeCharReferences() and Sanitizer::decodeCharReferences().
 *
 * Capture groups (the callbacks index them, so their order is fixed):
 *   1. named entity      &name;
 *   2. decimal reference &#123;
 *   3. hex reference     &#x7b;
 *   4. hex reference     &#X7B;
 *   5. a bare ampersand that starts none of the above
 *
 * The hexadecimal groups accept only real hex digits. hexdec() silently
 * discards any other character, so accepting [G-Zg-z] here would let
 * "&#x4Z1;" be decoded as U+0041.
 */
define( 'MW_CHAR_REFS_REGEX',
	'/&([A-Za-z0-9\x80-\xff]+);
	 |&\#([0-9]+);
	 |&\#x([0-9A-Fa-f]+);
	 |&\#X([0-9A-Fa-f]+);
	 |(&)/x' );

/**
 * Regular expression matching one HTML/XML attribute pair inside a tag.
 * Group 1 is the attribute name; groups 3 to 6 hold the value depending on
 * how it was written (see Sanitizer::getTagAttributeCallback()).
 */
$attrib = '[A-Za-z0-9]';
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
	"/(?:^|$space)((?:xml:|xmlns:)?$attrib+)
	  ($space*=$space*
		(?:
		 # The attribute value: quoted or alone
		  \"([^<\"]*)\"
		 | '([^<']*)'
		 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
		 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
							 # colors are specified like this.
							 # We'll be normalizing it.
		)
	   )?(?=$space|\$)/sx" );

/**
 * Regular expression matching URIs that could trigger script execution
 * (javascript: / vbscript:), optionally preceded by a CSS comment end.
 */
define( 'MW_EVIL_URI_PATTERN', '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i' );

/**
 * Regular expression matching an XML namespace declaration attribute name.
 * (The constant name's spelling is kept because callers reference it.)
 */
define( 'MW_XMLNS_ATTRIBUTE_PATTRN', "/^xmlns:$attrib+$/" );
00068
# Map of named character entity => Unicode code point (decimal).
# Read by Sanitizer::normalizeEntity(), Sanitizer::decodeEntity() and
# Sanitizer::hackDocType(). Keys are case-sensitive (e.g. 'Dagger' and
# 'dagger' are distinct entries).
# NOTE(review): the 252 entries look like the HTML 4.01 named entity set;
# confirm against the specification before adding or removing names.
# The variable is declared global explicitly, presumably so that it is still
# a true global when this file is included from inside a function scope.
00074 global $wgHtmlEntities;
00075 $wgHtmlEntities = array(
00076 'Aacute' => 193,
00077 'aacute' => 225,
00078 'Acirc' => 194,
00079 'acirc' => 226,
00080 'acute' => 180,
00081 'AElig' => 198,
00082 'aelig' => 230,
00083 'Agrave' => 192,
00084 'agrave' => 224,
00085 'alefsym' => 8501,
00086 'Alpha' => 913,
00087 'alpha' => 945,
00088 'amp' => 38,
00089 'and' => 8743,
00090 'ang' => 8736,
00091 'Aring' => 197,
00092 'aring' => 229,
00093 'asymp' => 8776,
00094 'Atilde' => 195,
00095 'atilde' => 227,
00096 'Auml' => 196,
00097 'auml' => 228,
00098 'bdquo' => 8222,
00099 'Beta' => 914,
00100 'beta' => 946,
00101 'brvbar' => 166,
00102 'bull' => 8226,
00103 'cap' => 8745,
00104 'Ccedil' => 199,
00105 'ccedil' => 231,
00106 'cedil' => 184,
00107 'cent' => 162,
00108 'Chi' => 935,
00109 'chi' => 967,
00110 'circ' => 710,
00111 'clubs' => 9827,
00112 'cong' => 8773,
00113 'copy' => 169,
00114 'crarr' => 8629,
00115 'cup' => 8746,
00116 'curren' => 164,
00117 'dagger' => 8224,
00118 'Dagger' => 8225,
00119 'darr' => 8595,
00120 'dArr' => 8659,
00121 'deg' => 176,
00122 'Delta' => 916,
00123 'delta' => 948,
00124 'diams' => 9830,
00125 'divide' => 247,
00126 'Eacute' => 201,
00127 'eacute' => 233,
00128 'Ecirc' => 202,
00129 'ecirc' => 234,
00130 'Egrave' => 200,
00131 'egrave' => 232,
00132 'empty' => 8709,
00133 'emsp' => 8195,
00134 'ensp' => 8194,
00135 'Epsilon' => 917,
00136 'epsilon' => 949,
00137 'equiv' => 8801,
00138 'Eta' => 919,
00139 'eta' => 951,
00140 'ETH' => 208,
00141 'eth' => 240,
00142 'Euml' => 203,
00143 'euml' => 235,
00144 'euro' => 8364,
00145 'exist' => 8707,
00146 'fnof' => 402,
00147 'forall' => 8704,
00148 'frac12' => 189,
00149 'frac14' => 188,
00150 'frac34' => 190,
00151 'frasl' => 8260,
00152 'Gamma' => 915,
00153 'gamma' => 947,
00154 'ge' => 8805,
00155 'gt' => 62,
00156 'harr' => 8596,
00157 'hArr' => 8660,
00158 'hearts' => 9829,
00159 'hellip' => 8230,
00160 'Iacute' => 205,
00161 'iacute' => 237,
00162 'Icirc' => 206,
00163 'icirc' => 238,
00164 'iexcl' => 161,
00165 'Igrave' => 204,
00166 'igrave' => 236,
00167 'image' => 8465,
00168 'infin' => 8734,
00169 'int' => 8747,
00170 'Iota' => 921,
00171 'iota' => 953,
00172 'iquest' => 191,
00173 'isin' => 8712,
00174 'Iuml' => 207,
00175 'iuml' => 239,
00176 'Kappa' => 922,
00177 'kappa' => 954,
00178 'Lambda' => 923,
00179 'lambda' => 955,
00180 'lang' => 9001,
00181 'laquo' => 171,
00182 'larr' => 8592,
00183 'lArr' => 8656,
00184 'lceil' => 8968,
00185 'ldquo' => 8220,
00186 'le' => 8804,
00187 'lfloor' => 8970,
00188 'lowast' => 8727,
00189 'loz' => 9674,
00190 'lrm' => 8206,
00191 'lsaquo' => 8249,
00192 'lsquo' => 8216,
00193 'lt' => 60,
00194 'macr' => 175,
00195 'mdash' => 8212,
00196 'micro' => 181,
00197 'middot' => 183,
00198 'minus' => 8722,
00199 'Mu' => 924,
00200 'mu' => 956,
00201 'nabla' => 8711,
00202 'nbsp' => 160,
00203 'ndash' => 8211,
00204 'ne' => 8800,
00205 'ni' => 8715,
00206 'not' => 172,
00207 'notin' => 8713,
00208 'nsub' => 8836,
00209 'Ntilde' => 209,
00210 'ntilde' => 241,
00211 'Nu' => 925,
00212 'nu' => 957,
00213 'Oacute' => 211,
00214 'oacute' => 243,
00215 'Ocirc' => 212,
00216 'ocirc' => 244,
00217 'OElig' => 338,
00218 'oelig' => 339,
00219 'Ograve' => 210,
00220 'ograve' => 242,
00221 'oline' => 8254,
00222 'Omega' => 937,
00223 'omega' => 969,
00224 'Omicron' => 927,
00225 'omicron' => 959,
00226 'oplus' => 8853,
00227 'or' => 8744,
00228 'ordf' => 170,
00229 'ordm' => 186,
00230 'Oslash' => 216,
00231 'oslash' => 248,
00232 'Otilde' => 213,
00233 'otilde' => 245,
00234 'otimes' => 8855,
00235 'Ouml' => 214,
00236 'ouml' => 246,
00237 'para' => 182,
00238 'part' => 8706,
00239 'permil' => 8240,
00240 'perp' => 8869,
00241 'Phi' => 934,
00242 'phi' => 966,
00243 'Pi' => 928,
00244 'pi' => 960,
00245 'piv' => 982,
00246 'plusmn' => 177,
00247 'pound' => 163,
00248 'prime' => 8242,
00249 'Prime' => 8243,
00250 'prod' => 8719,
00251 'prop' => 8733,
00252 'Psi' => 936,
00253 'psi' => 968,
00254 'quot' => 34,
00255 'radic' => 8730,
00256 'rang' => 9002,
00257 'raquo' => 187,
00258 'rarr' => 8594,
00259 'rArr' => 8658,
00260 'rceil' => 8969,
00261 'rdquo' => 8221,
00262 'real' => 8476,
00263 'reg' => 174,
00264 'rfloor' => 8971,
00265 'Rho' => 929,
00266 'rho' => 961,
00267 'rlm' => 8207,
00268 'rsaquo' => 8250,
00269 'rsquo' => 8217,
00270 'sbquo' => 8218,
00271 'Scaron' => 352,
00272 'scaron' => 353,
00273 'sdot' => 8901,
00274 'sect' => 167,
00275 'shy' => 173,
00276 'Sigma' => 931,
00277 'sigma' => 963,
00278 'sigmaf' => 962,
00279 'sim' => 8764,
00280 'spades' => 9824,
00281 'sub' => 8834,
00282 'sube' => 8838,
00283 'sum' => 8721,
00284 'sup' => 8835,
00285 'sup1' => 185,
00286 'sup2' => 178,
00287 'sup3' => 179,
00288 'supe' => 8839,
00289 'szlig' => 223,
00290 'Tau' => 932,
00291 'tau' => 964,
00292 'there4' => 8756,
00293 'Theta' => 920,
00294 'theta' => 952,
00295 'thetasym' => 977,
00296 'thinsp' => 8201,
00297 'THORN' => 222,
00298 'thorn' => 254,
00299 'tilde' => 732,
00300 'times' => 215,
00301 'trade' => 8482,
00302 'Uacute' => 218,
00303 'uacute' => 250,
00304 'uarr' => 8593,
00305 'uArr' => 8657,
00306 'Ucirc' => 219,
00307 'ucirc' => 251,
00308 'Ugrave' => 217,
00309 'ugrave' => 249,
00310 'uml' => 168,
00311 'upsih' => 978,
00312 'Upsilon' => 933,
00313 'upsilon' => 965,
00314 'Uuml' => 220,
00315 'uuml' => 252,
00316 'weierp' => 8472,
00317 'Xi' => 926,
00318 'xi' => 958,
00319 'Yacute' => 221,
00320 'yacute' => 253,
00321 'yen' => 165,
00322 'Yuml' => 376,
00323 'yuml' => 255,
00324 'Zeta' => 918,
00325 'zeta' => 950,
00326 'zwj' => 8205,
00327 'zwnj' => 8204 );
00328

# Alternative entity names mapped to a canonical key of $wgHtmlEntities.
# Both entries here are non-Latin spellings that resolve to 'rlm'.
# Looked up before $wgHtmlEntities in normalizeEntity() and decodeEntity().
00332 global $wgHtmlEntityAliases;
00333 $wgHtmlEntityAliases = array(
00334 'רלמ' => 'rlm',
00335 'رلم' => 'rlm',
00336 );
00337
00338
00343 class Sanitizer {
/**
 * Clean up HTML: keep only whitelisted elements and attributes, escape
 * everything else, and strip HTML comments.
 *
 * Any tag that is not allowed (or is badly nested) is emitted as text with
 * its "<" and ">" escaped to &lt; / &gt;, and any stray ">" in text is
 * escaped to &gt;. Without that escaping rejected tags would pass through
 * verbatim.
 *
 * @param $text String input markup
 * @param $processCallback Callback invoked as ( &$params, $args ) on the raw
 *        attribute string of each accepted opening tag before validation
 * @param $args Array passed through to $processCallback
 * @param $extratags Array of additional element names to allow
 * @param $removetags Array of element names (default or extra) to disallow
 * @return String sanitized markup
 */
static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
	global $wgUseTidy;

	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

	wfProfileIn( __METHOD__ );

	if ( !$staticInitialised ) {

		$htmlpairsStatic = array( # Tags that must be closed
			'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', 'abbr'
		);
		$htmlsingle = array(
			'br', 'hr', 'li', 'dt', 'dd'
		);
		$htmlsingleonly = array( # Elements that cannot have close tags
			'br', 'hr'
		);
		$htmlnest = array( # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
		);
		$tabletags = array( # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		);
		$htmllist = array( # Tags used by list
			'ul','ol',
		);
		$listtags = array( # Tags that can appear in a list
			'li',
		);

		$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
		$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

		# Convert them all to hashtables for faster lookup
		$vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
			'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
		foreach ( $vars as $var ) {
			$$var = array_flip( $$var );
		}
		$staticInitialised = true;
	}
	# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
	$extratags = array_flip( $extratags );
	$removetags = array_flip( $removetags );
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags );

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );
	# Everything before the first '<' is plain text; each later bit starts
	# right after a '<'.
	$bits = explode( '<', $text );
	$text = str_replace( '>', '&gt;', array_shift( $bits ) );
	if ( !$wgUseTidy ) {
		$tagstack = $tablestack = array();
		foreach ( $bits as $x ) {
			$regs = array();
			# $slash: Does the current element start with a '/'?
			# $t: Current element name
			# $params: String between element name and >
			# $brace: Ending '>' or '/>'
			# $rest: Everything until the next element of $bits
			if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
				list( , $slash, $t, $params, $brace, $rest ) = $regs;
			} else {
				$slash = $t = $params = $brace = $rest = null;
			}

			$badtag = false;
			if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
				# Check our stack
				if ( $slash && isset( $htmlsingleonly[$t] ) ) {
					$badtag = true;
				} elseif ( $slash ) {
					# Closing a tag... is it the one we just opened?
					$ot = @array_pop( $tagstack );
					if ( $ot != $t ) {
						if ( isset( $htmlsingleallowed[$ot] ) ) {
							# Pop all elements with an optional close tag
							# and see if we find a match below them
							$optstack = array();
							array_push( $optstack, $ot );
							$ot = @array_pop( $tagstack );
							while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
								array_push( $optstack, $ot );
								$ot = @array_pop( $tagstack );
							}
							if ( $t != $ot ) {
								# No match. Push the optional elements back again
								$badtag = true;
								while ( $ot = @array_pop( $optstack ) ) {
									array_push( $tagstack, $ot );
								}
							}
						} else {
							@array_push( $tagstack, $ot );
							# <li> can be nested in <ul> or <ol>, skip those cases:
							if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
								$badtag = true;
							}
						}
					} else {
						if ( $t == 'table' ) {
							# Leaving a table: restore the stack saved when it was opened
							$tagstack = array_pop( $tablestack );
						}
					}
					$newparams = '';
				} else {
					# Keep track for later
					if ( isset( $tabletags[$t] ) &&
						!in_array( 'table', $tagstack ) ) {
						$badtag = true;
					} elseif ( in_array( $t, $tagstack ) &&
						!isset( $htmlnest [$t ] ) ) {
						$badtag = true;
					# Is it a self closed htmlpair ? (bug 5487)
					} elseif ( $brace == '/>' &&
						isset( $htmlpairs[$t] ) ) {
						$badtag = true;
					} elseif ( isset( $htmlsingleonly[$t] ) ) {
						# Hack to force empty tag for uncloseable elements
						$brace = '/>';
					} elseif ( isset( $htmlsingle[$t] ) ) {
						# Hack to not close $htmlsingle tags
						$brace = null;
					} elseif ( isset( $tabletags[$t] )
						&& in_array( $t, $tagstack ) ) {
						# New table tag but forgot to close the previous one
						$text .= "</$t>";
					} else {
						if ( $t == 'table' ) {
							# Entering a table: park the current stack and start afresh
							array_push( $tablestack, $tagstack );
							$tagstack = array();
						}
						array_push( $tagstack, $t );
					}

					# Replace any variables or template parameters with
					# plaintext results.
					if( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					# Strip non-approved attributes from the tag
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
				}
				if ( !$badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
					$text .= "<$slash$t$newparams$close>$rest";
					continue;
				}
			}
			# Not an accepted tag: output it as escaped text
			$text .= '&lt;' . str_replace( '>', '&gt;', $x);
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
				$x, $regs );
			@list( , $slash, $t, $params, $brace, $rest ) = $regs;
			if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
				if( is_callable( $processCallback ) ) {
					call_user_func_array( $processCallback, array( &$params, $args ) );
				}
				$newparams = Sanitizer::fixTagAttributes( $params, $t );
				$rest = str_replace( '>', '&gt;', $rest );
				$text .= "<$slash$t$newparams$brace$rest";
			} else {
				$text .= '&lt;' . str_replace( '>', '&gt;', $x);
			}
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
00541
/**
 * Strip every terminated <!-- ... --> comment from $text.
 *
 * When a comment (plus the spaces around it) sits alone between two
 * newlines, the comment, the spaces and one of the newlines are removed so
 * no blank line is left behind. An unterminated comment stops processing
 * and is left in place.
 *
 * @param $text String
 * @return String
 */
static function removeHTMLcomments( $text ) {
	wfProfileIn( __METHOD__ );
	for ( ;; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		# Offset just past the closing "-->"
		$after = $close + 3;

		# Grow a window [$from, $from + $len) over the spaces on both sides
		$from = max( $open - 1, 0 );
		$len = $after - $from;
		while ( substr( $text, $from, 1 ) === ' ' && $from > 0 ) {
			$from--;
			$len++;
		}
		while ( substr( $text, $from + $len, 1 ) === ' ' ) {
			$len++;
		}

		$newlineBefore = substr( $text, $from, 1 ) === "\n";
		$newlineAfter = $newlineBefore && substr( $text, $from + $len, 1 ) === "\n";
		if ( $newlineAfter ) {
			# Drop comment, padding and both newlines, then put one newline back
			$text = substr_replace( $text, "\n", $from, $len + 1 );
		} else {
			# Drop only the comment itself
			$text = substr_replace( $text, '', $open, $after - $open );
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
00586
/**
 * Filter a decoded attribute array against the whitelist of one element.
 *
 * @param $attribs Array attribute name => value
 * @param $element String element name used to look up the whitelist
 * @return Array the attributes that survived validateAttributes()
 */
static function validateTagAttributes( $attribs, $element ) {
	$allowed = Sanitizer::attributeWhitelist( $element );
	return Sanitizer::validateAttributes( $attribs, $allowed );
}
00606
/**
 * Filter a decoded attribute array against an explicit whitelist.
 *
 * - xmlns:* declarations are kept when RDFa is enabled and the value does
 *   not match MW_EVIL_URI_PATTERN.
 * - Attributes that are not whitelisted are dropped.
 * - 'style' values go through checkCss(), 'id' values through escapeId().
 * - Link-relation, RDFa and microdata attributes are dropped when their
 *   value matches MW_EVIL_URI_PATTERN.
 * - 'href' and 'src' are dropped unless they start with a protocol from
 *   wfUrlProtocols() and contain no whitespace.
 *
 * @param $attribs Array attribute name => value
 * @param $whitelist Array list of allowed attribute names
 * @return Array filtered attribute name => value
 */
static function validateAttributes( $attribs, $whitelist ) {
	global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

	$allowed = array_flip( $whitelist );
	$urlPattern = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	# Attributes whose value may carry a URI and must not smuggle a script scheme
	$uriBearing = array(
		'rel', 'rev',
		'about', 'property', 'resource', 'datatype', 'typeof', # RDFa
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype', # HTML5 microdata
	);

	$out = array();
	foreach ( $attribs as $attribute => $value ) {
		# Allow XML namespace declarations if RDFa is enabled
		if ( $wgAllowRdfaAttributes && preg_match( MW_XMLNS_ATTRIBUTE_PATTRN, $attribute ) ) {
			if ( !preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
				$out[$attribute] = $value;
			}
			continue;
		}

		if ( !isset( $allowed[$attribute] ) ) {
			continue;
		}

		# Strip javascript "expression" and friends from inline styles
		if ( $attribute == 'style' ) {
			$value = Sanitizer::checkCss( $value );
		}

		if ( $attribute === 'id' ) {
			$value = Sanitizer::escapeId( $value, 'noninitial' );
		}

		if ( in_array( $attribute, $uriBearing, true )
			&& preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
			continue;
		}

		# NOTE: elements using href/src are not allowed directly, but this
		# validation is available to tag hook handlers and the like
		$isLink = ( $attribute === 'href' || $attribute === 'src' );
		if ( $isLink && !preg_match( $urlPattern, $value ) ) {
			continue;
		}

		$out[$attribute] = $value;
	}

	if ( !$wgAllowMicrodataAttributes ) {
		return $out;
	}

	# Only a fixed set of item types is accepted
	$allowedTypes = array(
		'http://microformats.org/profile/hcard',
		'http://microformats.org/profile/hcalendar#vevent',
		'http://n.whatwg.org/work',
	);
	if ( isset( $out['itemtype'] ) && !in_array( $out['itemtype'], $allowedTypes ) ) {
		# Removing itemscope makes the next step remove the dependent attributes
		unset( $out['itemscope'] );
	}
	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $out ) ) {
		foreach ( array( 'itemtype', 'itemid', 'itemref' ) as $dependent ) {
			unset( $out[$dependent] );
		}
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope.
	return $out;
}
00704
/**
 * Merge two attribute arrays; values in $b override those in $a, except
 * that differing string 'class' values are combined into one
 * space-separated list without duplicates.
 *
 * @param $a Array
 * @param $b Array
 * @return Array
 */
static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	if ( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
		return $merged;
	}
	$first = $a['class'];
	$second = $b['class'];
	if ( !is_string( $first ) || !is_string( $second ) || $first === $second ) {
		return $merged;
	}

	$names = preg_split( '/\s+/', "{$first} {$second}", -1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $names ) );
	return $merged;
}
00726
/**
 * Normalise an inline CSS value and reject it if it looks dangerous.
 *
 * Steps, in order: decode HTML character references; decode CSS backslash
 * escapes (via cssDecodeCallback()); replace comments with a space; cut the
 * value at any unterminated comment opener; then reject control characters
 * and the keywords expression, filter:, accelerator: and url(.
 *
 * @param $value String raw style attribute value
 * @return String the cleaned value, or a CSS comment naming the reason it
 *         was rejected
 */
static function checkCss( $value ) {
	$value = Sanitizer::decodeCharReferences( $value );

	# Decode CSS escape sequences. The pattern is built once per request.
	static $decodeRegex;
	if ( !$decodeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) |  # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? |  # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback(
		$decodeRegex,
		array( __CLASS__, 'cssDecodeCallback' ),
		$value
	);

	# Comments are replaced by a space rather than deleted, so removing one
	# cannot join two tokens into a new keyword.
	$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

	# Anything after a leftover comment opener is discarded
	$openComment = strpos( $value, '/*' );
	if ( $openComment !== false ) {
		$value = substr( $value, 0, $openComment );
	}

	# Reject control characters and problematic keywords
	if ( preg_match( '/[\000-\010\016-\037\177]/', $value ) ) {
		return '/* invalid control char */';
	}
	if ( preg_match( '! expression | filter\s*: | accelerator\s*: | url\s*\( !ix', $value ) ) {
		return '/* insecure input */';
	}
	return $value;
}
00796
/**
 * preg_replace_callback handler for the CSS escape pattern in checkCss().
 *
 * @param $matches Array 1 = line continuation, 2 = hex code point,
 *        3 = escaped single character; none set = lone trailing backslash
 * @return String the decoded character, or a canonical hex escape when the
 *         character is a newline, quote or backslash
 */
static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		# Line continuation: the backslash-newline pair vanishes
		return '';
	}

	if ( $matches[2] !== '' ) {
		$char = codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$char = $matches[3];
	} else {
		$char = '\\';
	}

	switch ( $char ) {
		case "\n":
		case '"':
		case "'":
		case '\\':
			# These must stay escaped so they cannot end a string or start
			# a new escape; emit them as a hex escape with its terminator.
			return '\\' . dechex( ord( $char ) ) . ' ';
		default:
			# Everything else is returned decoded
			return $char;
	}
}
00817
/**
 * Turn the raw attribute text of a tag into a sanitized attribute string.
 *
 * The text is decoded, filtered against the element's whitelist, and every
 * surviving pair is re-encoded as name="value".
 *
 * @param $text String raw text between the element name and the closing ">"
 * @param $element String element name
 * @return String '' when nothing survives, otherwise the attributes joined
 *         by spaces with one leading space
 */
static function fixTagAttributes( $text, $element ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	$decoded = Sanitizer::decodeTagAttributes( $text );
	$stripped = Sanitizer::validateTagAttributes( $decoded, $element );

	$pairs = array();
	foreach ( $stripped as $attribute => $value ) {
		$name = htmlspecialchars( $attribute );
		$encoded = Sanitizer::safeEncodeAttribute( $value );
		$pairs[] = "$name=\"$encoded\"";
	}

	if ( !count( $pairs ) ) {
		return '';
	}
	return ' ' . implode( ' ', $pairs );
}
00854
/**
 * Encode a string for use as an HTML attribute value.
 *
 * HTML special characters and both quote styles are escaped, and newline,
 * carriage return and tab are written as numeric character references so
 * they are not normalised away as literal whitespace.
 *
 * @param $text String
 * @return String HTML-encoded text fragment
 */
static function encodeAttribute( $text ) {
	$encValue = htmlspecialchars( $text, ENT_QUOTES );

	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	$encValue = strtr( $encValue, array(
		"\n" => '&#10;',
		"\r" => '&#13;',
		"\t" => '&#9;',
	) );

	return $encValue;
}
00874
/**
 * Encode an attribute value for HTML tags, additionally armouring it
 * against wiki markup that later parsing passes could expand.
 *
 * On top of encodeAttribute(), characters and keywords that start wiki
 * constructs (templates, links, quotes-based emphasis, magic links, table
 * pipes, double-underscore switches) are written as numeric character
 * references, and the ":" of every URL protocol from wfUrlProtocols() is
 * rewritten by armorLinksCallback().
 *
 * @param $text String
 * @return String HTML-encoded text fragment
 */
static function safeEncodeAttribute( $text ) {
	$encValue = Sanitizer::encodeAttribute( $text );

	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$encValue = strtr( $encValue, array(
		'<'    => '&lt;',   // These three should already have been
		'>'    => '&gt;',   // escaped by encodeAttribute(); handled
		'"'    => '&quot;', // again here as a safety net.
		'{'    => '&#123;',
		'['    => '&#91;',
		"''"   => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC'  => '&#82;FC',
		'PMID' => '&#80;MID',
		'|'    => '&#124;',
		'__'   => '&#95;_',
	) );

	# Stupid hack
	$encValue = preg_replace_callback(
		'/(' . wfUrlProtocols() . ')/',
		array( 'Sanitizer', 'armorLinksCallback' ),
		$encValue );
	return $encValue;
}
00907
/**
 * Turn an arbitrary string into a value usable as an id attribute.
 *
 * With $wgHtml5 and $wgExperimentalHtmlIds both on (and no 'legacy'
 * option), character references are decoded, runs of whitespace, "_",
 * quotes, "&" and "#" collapse to a single "_", and leading/trailing "_"
 * are trimmed; an empty result becomes "_".
 *
 * Otherwise spaces become "_", character references are decoded, the
 * result is URL-encoded with "%3A" turned back into ":" and every other
 * "%" into ".", and an "x" is prefixed when the id does not start with an
 * ASCII letter unless the 'noninitial' option is given.
 *
 * @param $id String
 * @param $options Mixed string or array of strings; recognised values are
 *        'legacy' and 'noninitial'
 * @return String
 */
static function escapeId( $id, $options = array() ) {
	global $wgHtml5, $wgExperimentalHtmlIds;
	$options = (array)$options;

	$useHtml5Rules = $wgHtml5 && $wgExperimentalHtmlIds
		&& !in_array( 'legacy', $options );
	if ( $useHtml5Rules ) {
		$id = Sanitizer::decodeCharReferences( $id );
		$id = preg_replace( '/[ \t\n\r\f_\'"&#]+/', '_', $id );
		$id = trim( $id, '_' );
		# An empty result means the input was nothing but separators
		return $id === '' ? '_' : $id;
	}

	# HTML4-style escaping
	static $replace = array(
		'%3A' => ':',
		'%' => '.'
	);

	$underscored = strtr( $id, ' ', '_' );
	$id = urlencode( Sanitizer::decodeCharReferences( $underscored ) );
	$id = str_replace( array_keys( $replace ), array_values( $replace ), $id );

	$startsWithLetter = preg_match( '/^[a-zA-Z]/', $id );
	if ( !$startsWithLetter && !in_array( 'noninitial', $options ) ) {
		// Initial character must be a letter!
		$id = "x$id";
	}
	return $id;
}
00969
/**
 * Turn an arbitrary string into a value usable as a CSS class name.
 *
 * A leading digit or hyphen, control characters, space, ASCII punctuation
 * other than "-" and "_", and the UTF-8 no-break space each become "_";
 * runs of "_" then collapse to one and trailing "_" are removed.
 *
 * @param $class String
 * @return String
 */
static function escapeClass( $class ) {
	$invalid = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';

	$escaped = preg_replace( $invalid, '_', $class );
	$collapsed = preg_replace( '/_+/', '_', $escaped );
	return rtrim( $collapsed, '_' );
}
00988
/**
 * Escape HTML special characters in $html while keeping character
 * references usable.
 *
 * Everything is escaped first; the escaped ampersands are then turned back
 * into "&" so that normalizeCharReferences() can recognise valid
 * references and re-escape every ampersand that does not start one.
 *
 * @param $html String text that may contain character references
 * @return String escaped text with normalized character references
 */
static function escapeHtmlAllowEntities( $html ) {
	# It seems wise to escape ' as well as ", as a matter of course.  Can't
	# hurt.
	$html = htmlspecialchars( $html, ENT_QUOTES );
	# Undo only the ampersand escaping; the next step decides per "&"
	$html = str_replace( '&amp;', '&', $html );
	$html = Sanitizer::normalizeCharReferences( $html );
	return $html;
}
01004
/**
 * preg_replace_callback handler used by safeEncodeAttribute(): rewrites the
 * ":" in a matched URL protocol as a numeric character reference so the
 * text is no longer recognised as a link by later parsing.
 *
 * @param $matches Array regex match; index 1 is the protocol text
 * @return String the protocol with every ":" replaced by "&#58;"
 */
private static function armorLinksCallback( $matches ) {
	return str_replace( ':', '&#58;', $matches[1] );
}
01013
/**
 * Parse the raw attribute text of a tag into an associative array.
 *
 * Names are lower-cased, whitespace runs in values collapse to one space,
 * values are trimmed and their character references decoded. When a name
 * occurs more than once the last occurrence wins.
 *
 * @param $text String raw text between the element name and the closing ">"
 * @return Array attribute name => decoded value (empty when nothing matches)
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) == '' ) {
		return array();
	}

	$pairs = array();
	$found = preg_match_all( MW_ATTRIBS_REGEX, $text, $pairs, PREG_SET_ORDER );
	$attribs = array();
	if ( !$found ) {
		return $attribs;
	}

	foreach ( $pairs as $set ) {
		$name = strtolower( $set[1] );
		$raw = Sanitizer::getTagAttributeCallback( $set );

		// Normalize whitespace
		$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

		// Decode character references
		$attribs[$name] = Sanitizer::decodeCharReferences( $collapsed );
	}
	return $attribs;
}
01050
/**
 * Pick the attribute value out of one MW_ATTRIBS_REGEX match set.
 *
 * The value groups are checked from the highest index down: 6 is an
 * unquoted #colour, 5 an unquoted value, 4 a single-quoted value and 3 a
 * double-quoted value. If no "=value" part matched at all, the attribute
 * name itself is returned (XHTML requires every attribute to have a value).
 *
 * @param $set Array one match set produced with PREG_SET_ORDER
 * @return String
 * @throws MWException when group 2 is set but no value group is
 */
private static function getTagAttributeCallback( $set ) {
	for ( $group = 6; $group >= 3; $group-- ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( !isset( $set[2] ) ) {
		# Valueless ('reduced') form: use the attribute name as its value
		return $set[1];
	}

	throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
01079
/**
 * Normalize an attribute value so it can be placed between double quotes.
 *
 * Character references are normalized, whitespace characters are turned
 * into spaces, and any literal double quote is written as "&quot;" so it
 * cannot terminate the attribute.
 *
 * @param $text String
 * @return String
 */
private static function normalizeAttributeValue( $text ) {
	return str_replace( '"', '&quot;',
		self::normalizeWhitespace(
			Sanitizer::normalizeCharReferences( $text ) ) );
}
01096
/**
 * Replace each CRLF pair, and each single space, CR, LF or tab, with one
 * space character. Runs are not collapsed.
 *
 * @param $text String
 * @return String
 */
private static function normalizeWhitespace( $text ) {
	$pattern = '/\r\n|[\x20\x0d\x0a\x09]/';
	$normalized = preg_replace( $pattern, ' ', $text );
	return $normalized;
}
01103
/**
 * Run normalizeCharReferencesCallback() over every character reference and
 * bare ampersand in $text (as matched by MW_CHAR_REFS_REGEX).
 *
 * @param $text String
 * @return String
 */
static function normalizeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $handler, $text );
}
/**
 * preg_replace_callback handler for normalizeCharReferences().
 *
 * Dispatches on the MW_CHAR_REFS_REGEX group that matched: named entity,
 * decimal reference, or hex reference (lower- or upper-case "x"). When no
 * handler yields a result (bare ampersand or rejected code point) the
 * whole match is HTML-escaped instead.
 *
 * @param $matches Array
 * @return String
 */
static function normalizeCharReferencesCallback( $matches ) {
	$normalized = null;

	# Groups are tested in order; later indexes may be absent once an
	# earlier group has matched, so do not read ahead.
	if ( $matches[1] != '' ) {
		$normalized = Sanitizer::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$normalized = Sanitizer::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$normalized = Sanitizer::hexCharReference( $matches[3] );
	} elseif ( $matches[4] != '' ) {
		$normalized = Sanitizer::hexCharReference( $matches[4] );
	}

	return is_null( $normalized )
		? htmlspecialchars( $matches[0] )
		: $normalized;
}
01145
/**
 * Normalize one named entity.
 *
 * An alias from $wgHtmlEntityAliases is rewritten to its canonical name, a
 * name present in $wgHtmlEntities is kept as a valid reference, and any
 * other name has its ampersand escaped so it is shown literally instead of
 * being passed through as an undefined entity.
 *
 * @param $name String entity name without "&" and ";"
 * @return String "&name;" for a known entity, "&amp;name;" otherwise
 */
static function normalizeEntity( $name ) {
	global $wgHtmlEntities, $wgHtmlEntityAliases;
	if ( isset( $wgHtmlEntityAliases[$name] ) ) {
		return "&{$wgHtmlEntityAliases[$name]};";
	} elseif( isset( $wgHtmlEntities[$name] ) ) {
		return "&$name;";
	} else {
		# Unknown entity: escape the ampersand
		return "&amp;$name;";
	}
}
01165
/**
 * Normalize a decimal character reference.
 *
 * @param $codepoint String the digits between "&#" and ";"
 * @return String|null canonical "&#N;" form, or null when the code point
 *         fails validateCodepoint()
 */
static function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
01174
/**
 * Normalize a hexadecimal character reference.
 *
 * @param $codepoint String the hex digits between "&#x" and ";"
 * @return String|null canonical lower-case "&#xN;" form, or null when the
 *         code point fails validateCodepoint()
 */
static function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );
	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
01183
/**
 * Tell whether a code point is accepted in character references: tab, line
 * feed, carriage return, or anything in 0x20-0xD7FF, 0xE000-0xFFFD or
 * 0x10000-0x10FFFF.
 *
 * @param $codepoint Integer
 * @return Boolean
 */
private static function validateCodepoint( $codepoint ) {
	if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}
	if ( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
		return true;
	}
	if ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}
	return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
01197
/**
 * Run decodeCharReferencesCallback() over every character reference and
 * bare ampersand in $text (as matched by MW_CHAR_REFS_REGEX).
 *
 * @param $text String
 * @return String
 */
public static function decodeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'decodeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $handler, $text );
}
01211
/**
 * preg_replace_callback handler for decodeCharReferences().
 *
 * @param $matches Array MW_CHAR_REFS_REGEX match: 1 = entity name,
 *        2 = decimal digits, 3 and 4 = hex digits
 * @return String the decoded text, or the match unchanged for a bare "&"
 */
static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return Sanitizer::decodeEntity( $matches[1] );
	}
	if ( $matches[2] != '' ) {
		return Sanitizer::decodeChar( intval( $matches[2] ) );
	}
	# Groups 3 and 4 are the two spellings of a hex reference. They are
	# read in order because index 4 may be absent when group 3 matched.
	foreach ( array( 3, 4 ) as $group ) {
		if ( $matches[$group] != '' ) {
			return Sanitizer::decodeChar( hexdec( $matches[$group] ) );
		}
	}
	# Last case should be an ampersand by itself
	return $matches[0];
}
01229
/**
 * Convert a code point to UTF-8.
 *
 * @param $codepoint Integer
 * @return String the encoded character, or UTF8_REPLACEMENT when the code
 *         point fails validateCodepoint()
 */
static function decodeChar( $codepoint ) {
	if ( !Sanitizer::validateCodepoint( $codepoint ) ) {
		return UTF8_REPLACEMENT;
	}
	return codepointToUtf8( $codepoint );
}
01244
/**
 * Decode one named entity to UTF-8.
 *
 * An alias from $wgHtmlEntityAliases is first replaced by its canonical
 * name. A name missing from $wgHtmlEntities is returned as "&name;", using
 * the canonical name if an alias was resolved.
 *
 * @param $name String entity name without "&" and ";"
 * @return String
 */
static function decodeEntity( $name ) {
	global $wgHtmlEntities, $wgHtmlEntityAliases;

	$canonical = isset( $wgHtmlEntityAliases[$name] )
		? $wgHtmlEntityAliases[$name]
		: $name;

	if ( !isset( $wgHtmlEntities[$canonical] ) ) {
		return "&$canonical;";
	}
	return codepointToUtf8( $wgHtmlEntities[$canonical] );
}
01264
/**
 * Return the attribute names allowed on one element.
 *
 * The full table is built by setupAttributeWhitelist() on first use and
 * then kept for the rest of the request.
 *
 * @param $element String element name
 * @return Array list of attribute names (empty for an unknown element)
 */
static function attributeWhitelist( $element ) {
	static $list;
	if ( !isset( $list ) ) {
		$list = Sanitizer::setupAttributeWhitelist();
	}

	if ( !isset( $list[$element] ) ) {
		return array();
	}
	return $list[$element];
}
01280
/**
 * Build the table of allowed attributes per element.
 *
 * RDFa attributes are added to the common set when $wgAllowRdfaAttributes
 * is on; microdata attributes when both $wgHtml5 and
 * $wgAllowMicrodataAttributes are on.
 *
 * @return Array element name => list of allowed attribute names
 */
static function setupAttributeWhitelist() {
	global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;

	$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );

	if ( $wgAllowRdfaAttributes ) {
		# RDFa attributes as specified in section 9 of
		# http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		$rdfa = array( 'about', 'property', 'resource', 'datatype', 'typeof' );
		$common = array_merge( $common, $rdfa );
	}

	if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
		# HTML5 microdata attributes as specified by
		# http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
		$microdata = array( 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype' );
		$common = array_merge( $common, $microdata );
	}

	$block = array_merge( $common, array( 'align' ) );
	$tablealign = array( 'align', 'char', 'charoff', 'valign' );
	$tablecell = array(
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap',  # deprecated
		'width',   # deprecated
		'height',  # deprecated
		'bgcolor', # deprecated
	);

	# Combinations shared by several elements below
	$edit = array_merge( $common, array( 'cite', 'datetime' ) );
	$rowgroup = array_merge( $common, $tablealign );
	$column = array_merge( $common, array( 'span', 'width' ), $tablealign );
	$cell = array_merge( $common, $tablecell, $tablealign );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	$whitelist = array();

	# 7.5.4
	$whitelist['div'] = $block;
	$whitelist['center'] = $common; # deprecated
	$whitelist['span'] = $block; # ??

	# 7.5.5
	foreach ( array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ) as $heading ) {
		$whitelist[$heading] = $block;
	}

	# 7.5.6 (address) and 8.2.4 (bdo) are not allowed

	# 9.2.1 (dfn, samp, kbd and acronym are not allowed)
	foreach ( array( 'em', 'strong', 'cite', 'code', 'var', 'abbr' ) as $phrase ) {
		$whitelist[$phrase] = $common;
	}

	# 9.2.2 (q is not allowed)
	$whitelist['blockquote'] = array_merge( $common, array( 'cite' ) );

	# 9.2.3
	$whitelist['sub'] = $common;
	$whitelist['sup'] = $common;

	# 9.3.1
	$whitelist['p'] = $block;

	# 9.3.2
	$whitelist['br'] = array( 'id', 'class', 'title', 'style', 'clear' );

	# 9.3.4
	$whitelist['pre'] = array_merge( $common, array( 'width' ) );

	# 9.4
	$whitelist['ins'] = $edit;
	$whitelist['del'] = $edit;

	# 10.2
	$whitelist['ul'] = array_merge( $common, array( 'type' ) );
	$whitelist['ol'] = array_merge( $common, array( 'type', 'start' ) );
	$whitelist['li'] = array_merge( $common, array( 'type', 'value' ) );

	# 10.3
	$whitelist['dl'] = $common;
	$whitelist['dd'] = $common;
	$whitelist['dt'] = $common;

	# 11.2.1
	$whitelist['table'] = array_merge( $common, array(
		'summary', 'width', 'border', 'frame',
		'rules', 'cellspacing', 'cellpadding',
		'align', 'bgcolor',
	) );

	# 11.2.2
	$whitelist['caption'] = array_merge( $common, array( 'align' ) );

	# 11.2.3
	$whitelist['thead'] = $rowgroup;
	$whitelist['tfoot'] = $rowgroup;
	$whitelist['tbody'] = $rowgroup;

	# 11.2.4
	$whitelist['colgroup'] = $column;
	$whitelist['col'] = $column;

	# 11.2.5
	$whitelist['tr'] = array_merge( $common, array( 'bgcolor' ), $tablealign );

	# 11.2.6
	$whitelist['td'] = $cell;
	$whitelist['th'] = $cell;

	# 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist
	# is used from the Parser object
	$whitelist['a'] = array_merge( $common, array( 'href', 'rel', 'rev' ) ); # rel/rev esp. for RDFa

	# 13.2
	# Not usually allowed, but may be used for extension-style hooks
	# such as <math> when it is rasterized
	$whitelist['img'] = array_merge( $common, array( 'alt' ) );

	# 15.2.1
	foreach ( array( 'tt', 'b', 'i', 'big', 'small', 'strike', 's', 'u' ) as $fontStyle ) {
		$whitelist[$fontStyle] = $common;
	}

	# 15.2.2 (basefont is not allowed)
	$whitelist['font'] = array_merge( $common, array( 'size', 'color', 'face' ) );

	# 15.3
	$whitelist['hr'] = array_merge( $common, array( 'noshade', 'size', 'width' ) );

	# XHTML Ruby annotation text module, simple ruby only
	# (rbc and rtc are not allowed; rt does not get 'rbspan')
	foreach ( array( 'ruby', 'rb', 'rt', 'rp' ) as $rubyTag ) {
		$whitelist[$rubyTag] = $common;
	}

	# MathML root element, where used for extensions
	# 'title' may not be 100% valid here; it's XHTML
	# http://www.w3.org/TR/REC-MathML/
	$whitelist['math'] = array( 'class', 'style', 'id', 'title' );

	return $whitelist;
}
01451
/**
 * Reduce markup to plain text: everything between "<" and ">" is removed,
 * character references are decoded, and whitespace characters are turned
 * into spaces.
 *
 * @param $text String
 * @return String
 */
static function stripAllTags( $text ) {
	# Actual <tags>
	$withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

	# Normalize &entities and whitespace
	$decoded = self::decodeCharReferences( $withoutTags );
	return self::normalizeWhitespace( $decoded );
}
01472
/**
 * Build a DOCTYPE declaration whose internal subset declares every entry of
 * $wgHtmlEntities as an XML entity pointing at its numeric reference.
 *
 * @return String
 */
static function hackDocType() {
	global $wgHtmlEntities;

	$declarations = array();
	foreach ( $wgHtmlEntities as $entity => $codepoint ) {
		$declarations[] = "<!ENTITY $entity \"&#$codepoint;\">";
	}

	return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
01491
/**
 * Clean up a URL before it is used in a link.
 *
 * Character references are decoded; square brackets, angle brackets,
 * double quotes, control characters, space and DEL are then URL-encoded;
 * finally characters that are ignored in internationalised host names are
 * stripped from the host part.
 *
 * The encoding step uses a callback instead of the /e pattern modifier,
 * which no longer exists in PHP 7 and, where it did exist, passed the
 * match through addslashes() so that a double quote or NUL byte was
 * encoded with a spurious backslash.
 *
 * @param $url String
 * @return String
 */
static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = Sanitizer::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F]/',
		array( __CLASS__, 'cleanUrlCallback' ), $url );

	# Validate hostname portion
	$matches = array();
	if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
		list( /* $whole */, $protocol, $host, $rest ) = $matches;

		// Characters that will be ignored in IDNs.
		// http://tools.ietf.org/html/3454#section-3.1
		// Strip them before further processing so blacklists and such work.
		$strip = "/
			\\s|          # general whitespace
			\xc2\xad|     # 00ad SOFT HYPHEN
			\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
			\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
			\xe2\x81\xa0| # 2060 WORD JOINER
			\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
			\xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
			\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
			\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
			\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
			\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
			\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
			[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
			/xuD";

		$host = preg_replace( $strip, '', $host );

		// @todo Fixme: validate hostnames here

		return $protocol . $host . $rest;
	} else {
		return $url;
	}
}

/**
 * preg_replace_callback handler for cleanUrl(): URL-encode the matched
 * character.
 *
 * @param $matches Array regex match; index 0 is the character to encode
 * @return String
 */
static function cleanUrlCallback( $matches ) {
	return urlencode( $matches[0] );
}
01533
01534 }