00001 <?php
00002
00003 require_once( dirname( __FILE__ ) . '/../Maintenance.php' );
00004
00005 require_once( dirname( __FILE__ ) . '/../../includes/normal/UtfNormalUtil.php' );
00006
/**
 * Maintenance script that builds the serialized character-normalization
 * tables written to $IP/serialized/:
 *
 *  - normalize-ar.ser: Arabic presentation forms => their decomposition,
 *    extracted from a local copy of UnicodeData.txt.
 *  - normalize-ml.ser: a fixed table of Malayalam chillu sequences.
 *
 * Each table is a serialized PHP array mapping a UTF-8 source string to its
 * UTF-8 replacement. The conversion from hexadecimal code points to UTF-8 is
 * delegated to hexSequenceToUtf8(), which is defined outside this class.
 */
class GenerateNormalizerData extends Maintenance {
	/** @var string Path of the UnicodeData.txt file to read; set by execute() */
	public $dataFile;

	public function __construct() {
		parent::__construct();
		$this->addOption( 'unicode-data-file', 'The local location of the data file ' .
			'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
	}

	/**
	 * Locate the Unicode data file, then generate both tables.
	 *
	 * Without --unicode-data-file the script looks for UnicodeData.txt in the
	 * current working directory. Terminates with exit status 1 if the file
	 * cannot be found.
	 */
	public function execute() {
		if ( $this->hasOption( 'unicode-data-file' ) ) {
			$this->dataFile = $this->getOption( 'unicode-data-file' );
			$notFoundMessage = 'Unable to find the specified data file.';
		} else {
			$this->dataFile = 'UnicodeData.txt';
			$notFoundMessage = "Unable to find UnicodeData.txt. Please specify its location with --unicode-data-file=<FILE>";
		}

		if ( !file_exists( $this->dataFile ) ) {
			$this->error( $notFoundMessage );
			exit( 1 );
		}

		$this->generateArabic();
		$this->generateMalayalam();
	}

	/**
	 * Build normalize-ar.ser from the data file.
	 *
	 * Every code point in the Arabic Presentation Forms-A (U+FB50..U+FDFF) and
	 * Presentation Forms-B (U+FE70..U+FEFF) ranges that carries a tagged
	 * decomposition (e.g. "<isolated> 0671") is mapped to that decomposition.
	 * Entries without a decomposition are skipped; entries whose decomposition
	 * cannot be parsed are reported and skipped.
	 *
	 * Terminates with exit status 1 if the data file cannot be opened or the
	 * output file cannot be written.
	 */
	public function generateArabic() {
		$file = fopen( $this->dataFile, 'r' );
		if ( !$file ) {
			$this->error( 'Unable to open the data file.' );
			exit( 1 );
		}

		# Only two of the semicolon-separated columns of UnicodeData.txt are
		# needed. They are read by position rather than through a name table:
		# the file has three numeric columns (6-8), so a 13-entry name list
		# does not line up with the real columns after index 6.
		$codeField = 0;          # Code point, hexadecimal
		$decompositionField = 5; # Decomposition_Type / Decomposition_Mapping

		$pairs = array();

		$lineNum = 0;
		while ( false !== ( $line = fgets( $file ) ) ) {
			++$lineNum;

			# Strip comments
			$line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
			if ( $line === '' ) {
				continue;
			}

			# Split fields
			$fields = explode( ';', $line );
			if ( !isset( $fields[$decompositionField] ) ) {
				# Truncated or malformed record: report it instead of
				# reading undefined offsets.
				$this->error( "Too few fields on line $lineNum" );
				$this->error( $line );
				continue;
			}

			$code = hexdec( $fields[$codeField] );
			$isPresentationForm =
				( $code >= 0xFB50 && $code <= 0xFDFF ) # Arabic presentation forms A
				|| ( $code >= 0xFE70 && $code <= 0xFEFF ); # Arabic presentation forms B
			if ( !$isPresentationForm ) {
				continue;
			}

			$decomposition = $fields[$decompositionField];
			if ( $decomposition === '' ) {
				# No decomposition
				continue;
			}
			# $m[1] is the "<tag>", $m[2] the space-separated hex code points
			if ( !preg_match( '/^ *(<\w*>) +([0-9A-F ]*)$/', $decomposition, $m ) ) {
				$this->error( "Can't parse Decomposition_Type/Mapping on line $lineNum" );
				$this->error( $line );
				continue;
			}

			$source = hexSequenceToUtf8( $fields[$codeField] );
			$dest = hexSequenceToUtf8( $m[2] );
			$pairs[$source] = $dest;
		}

		# Release the handle before writing the output
		fclose( $file );

		$this->writePairs( 'ar', $pairs );
	}

	/**
	 * Build normalize-ml.ser from a fixed table.
	 *
	 * Each entry maps a three-code-point sequence ending in U+0D4D U+200D to
	 * a single code point in the range U+0D7A..U+0D7F (the Malayalam chillu
	 * letters).
	 *
	 * NOTE(review): the reference URLs that accompanied the two groups of
	 * entries are truncated ("From http:") in the copy this was edited from;
	 * restore them from the upstream file.
	 */
	public function generateMalayalam() {
		$hexPairs = array(
			# First group
			'0D23 0D4D 200D' => '0D7A',
			'0D28 0D4D 200D' => '0D7B',
			'0D30 0D4D 200D' => '0D7C',
			'0D32 0D4D 200D' => '0D7D',
			'0D33 0D4D 200D' => '0D7E',

			# Second group (separate source)
			'0D15 0D4D 200D' => '0D7F',
		);

		$pairs = array();
		foreach ( $hexPairs as $hexSource => $hexDest ) {
			$source = hexSequenceToUtf8( $hexSource );
			$dest = hexSequenceToUtf8( $hexDest );
			$pairs[$source] = $dest;
		}

		$this->writePairs( 'ml', $pairs );
	}

	/**
	 * Serialize a table to $IP/serialized/normalize-<lang>.ser and print a
	 * one-line summary. Terminates with exit status 1 if the write fails, so
	 * a failed write is never reported as success.
	 *
	 * @param $lang String: language code used in the file name and summary
	 * @param $pairs Array: UTF-8 source string => UTF-8 replacement
	 */
	private function writePairs( $lang, array $pairs ) {
		global $IP;

		$path = "$IP/serialized/normalize-$lang.ser";
		if ( file_put_contents( $path, serialize( $pairs ) ) === false ) {
			$this->error( "Unable to write $path" );
			exit( 1 );
		}
		echo "$lang: " . count( $pairs ) . " pairs written.\n";
	}
}
00134
// Name the class defined above, then include the script that DO_MAINTENANCE
// points to (presumably the runner that instantiates $maintClass - confirm
// against Maintenance.php).
$maintClass = 'GenerateNormalizerData';
require_once DO_MAINTENANCE;
00137