00001
00002
00003
00004 import tarfile, zipfile
00005 import os, re, shutil, sys, platform
00006
# Interpreter / platform probes used to select compatible helpers below.
pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'

# Bind three names whose implementation differs between Python 2 and 3:
#   urllib_request -- module providing urlretrieve()
#   uniopen        -- open() variant accepting encoding=/errors=
#   unichr2        -- code point (int) -> text, including astral code points
if pyversion.startswith( '3.' ):
    import urllib.request as urllib_request
    uniopen = open
    unichr2 = chr
elif pyversion[:3] in ( '2.5', '2.6', '2.7' ):
    import urllib as urllib_request
    import codecs
    uniopen = codecs.open

    def unichr2( i ):
        """Return the text for code point `i`, also on narrow Python 2 builds."""
        if i >= 0x10000 and sys.maxunicode < 0x10000:
            # Narrow build: spell the code point as a UTF-16 surrogate pair.
            return unichr( 0xD7C0 + ( i >> 10 ) ) + unichr( 0xDC00 + ( i & 0x3FF ) )
        return unichr( i )
00023
00024
# Download configuration. These values are interpolated into the
# SourceForge download URLs and the local archive file names built in main().
# SF_MIRROR is used as the host-name prefix of '<mirror>.dl.sourceforge.net'.
SF_MIRROR = 'easynews'
# Release versions of the three upstream packages that are downloaded.
SCIM_TABLES_VER = '0.5.9'
SCIM_PINYIN_VER = '0.5.91'
LIBTABE_VER = '0.2.3'
00029
00030
def GetFileFromURL( url, dest ):
    """Download `url` to the local file `dest` unless `dest` already exists.

    On Linux the transfer is delegated to the external `wget` program; if
    `wget` cannot be started, or on any other platform, urlretrieve() is used.

    url  -- address of the file to fetch.
    dest -- local path the download is stored at.

    Returns None. A failed wget run is not raised as an error; the partial
    output file is removed so a later run does not treat it as up to date.
    """
    if os.path.isfile( dest ):
        print( 'File %s up to date.' % dest )
        return
    if islinux:
        import subprocess
        try:
            # Argument list (no shell), and an explicit -O so the file really
            # lands at `dest` instead of at the URL's base name.
            status = subprocess.call( ['wget', '-O', dest, url] )
        except OSError:
            # wget could not be launched; fall through to urlretrieve().
            status = None
        if status is not None:
            if status != 0 and os.path.isfile( dest ):
                os.remove( dest )
            return
    print( 'Downloading from [%s] ...' % url )
    urllib_request.urlretrieve( url, dest )
    print( 'Download complete.\n' )
    return
00045
def GetFileFromUnihan( path ):
    """Extract 'Unihan_Variants.txt' from the zip archive `path`.

    The member is written unchanged to 'Unihan_Variants.txt' in the current
    working directory. Returns None.
    """
    print( 'Extracting files from %s ...' % path )
    archive = zipfile.ZipFile( path )
    try:
        data = archive.read( 'Unihan_Variants.txt' )
    finally:
        archive.close()
    # ZipFile.read() yields raw bytes, so the copy must be written in binary
    # mode; a text-mode handle rejects bytes on Python 3.
    uhfile = open( 'Unihan_Variants.txt', 'wb' )
    try:
        uhfile.write( data )
    finally:
        uhfile.close()
    return
00053
def GetFileFromTar( path, member, rename ):
    """Extract one member of a gzip-compressed tar archive and rename it.

    path   -- the .tar.gz / .tgz archive to read.
    member -- name of the member inside the archive ('/'-separated).
    rename -- path the extracted file is moved to.

    The member is extracted relative to the current working directory, moved
    to `rename`, and the top-level directory created by the extraction is
    removed again. Returns None.
    """
    print( 'Extracting %s from %s ...' % (rename, path) )
    archive = tarfile.open( path, 'r:gz' )
    try:
        archive.extract( member )
    finally:
        archive.close()
    shutil.move( member, rename )
    tree_rmv = member.split( '/' )[0]
    # A member without a directory part leaves no tree behind after the move,
    # so only remove the leftover when it is really a directory.
    if os.path.isdir( tree_rmv ):
        shutil.rmtree( tree_rmv )
    return
00061
def ReadBIG5File( dest ):
    """Decode the Big5-HKSCS file `dest`, rewrite it as UTF-8, return its text.

    Bytes that cannot be decoded are replaced by the decoder with U+FFFD;
    every U+FFFD is then turned into a newline. The file at `dest` is
    overwritten in place with the UTF-8 encoded result.
    """
    print( 'Reading and decoding %s ...' % dest )
    f1 = uniopen( dest, 'r', encoding='big5hkscs', errors='replace' )
    try:
        text = f1.read()
    finally:
        f1.close()
    # Build U+FFFD with unichr2() instead of a '\ufffd' literal: in a
    # Python 2 byte-string literal '\u' is not an escape, so the literal
    # would never match the replacement character there.
    text = text.replace( unichr2( 0xFFFD ), '\n' )
    f2 = uniopen( dest, 'w', encoding='utf8' )
    try:
        f2.write( text )
    finally:
        f2.close()
    return text
00072
def ReadFile( dest ):
    """Return the complete contents of the UTF-8 text file `dest`."""
    print( 'Reading and decoding %s ...' % dest )
    f = uniopen( dest, 'r', encoding='utf8' )
    try:
        return f.read()
    finally:
        # Close the handle even when reading or decoding fails.
        f.close()
00079
def ReadUnihanFile( dest ):
    """Collect the variant records from the UTF-8 file `dest`.

    Lines starting with '#' are skipped. A line containing
    'kSimplifiedVariant' (checked first) or 'kTraditionalVariant' is split
    on that keyword; the stripped text before and after it is stored as a
    (before, after) tuple.

    Returns ( t2s_code, s2t_code ): the list of tuples taken from the
    kSimplifiedVariant lines and the list taken from the
    kTraditionalVariant lines, both in file order.
    """
    print( 'Reading and decoding %s ...' % dest )
    t2s_code = []
    s2t_code = []
    f = uniopen( dest, 'r', encoding='utf8' )
    try:
        for line in f:
            if line.startswith( '#' ):
                continue
            if 'kSimplifiedVariant' in line:
                temp = line.split( 'kSimplifiedVariant' )
                t2s_code.append( ( temp[0].strip(), temp[1].strip() ) )
            elif 'kTraditionalVariant' in line:
                temp = line.split( 'kTraditionalVariant' )
                s2t_code.append( ( temp[0].strip(), temp[1].strip() ) )
    finally:
        # Release the handle even if decoding fails part-way through.
        f.close()
    return ( t2s_code, s2t_code )
00100
def RemoveRows( text, num ):
    """Return `text` with its first `num` rows removed.

    A row is the rest of a line together with all whitespace that follows
    it, so blank lines after a removed row disappear with it. When `num` is
    zero or negative the text is returned unchanged.
    """
    if num <= 0:
        # re.sub() interprets count=0 as "replace every match", which would
        # erase the whole text instead of removing nothing.
        return text
    return re.sub( r'.*\s*', '', text, count=num )
00104
def RemoveOneCharConv( text ):
    """Blank out every line of `text` that holds exactly one character.

    A line matches when it is a single character optionally followed by
    whitespace up to an end of line. Only the matched text is deleted; a
    line break after it may remain in the result.
    """
    # Raw string: '\s' is a regex class, not a Python string escape.
    one_char_line = re.compile( r'^.\s*$', re.MULTILINE )
    return one_char_line.sub( '', text )
00109
def ConvertToChar( code ):
    """Convert a code point token into the character it denotes.

    Everything from the first '<' onward is discarded, the first two
    characters are skipped (presumably the 'U+' prefix -- confirm against
    the input data) and the remainder is parsed as a hexadecimal number.
    """
    head = code.partition( '<' )[0]
    hex_digits = head[2:]
    return unichr2( int( hex_digits, 16 ) )
00113
def GetDefaultTable( code_table ):
    """Build a character table from (source code, target codes) pairs.

    code_table -- iterable of 2-tuples; the second item is a
                  whitespace-separated list of code tokens.

    Pairs with an empty source or empty target are ignored. Returns a dict
    mapping the source character to the list of its target characters; a
    later pair with the same source replaces an earlier one.
    """
    char_table = {}
    for source_code, target_codes in code_table:
        if not ( source_code and target_codes ):
            continue
        from_char = ConvertToChar( source_code )
        to_chars = []
        for token in target_codes.split():
            to_chars.append( ConvertToChar( token ) )
        char_table[from_char] = to_chars
    return char_table
00122
def GetManualTable( dest ):
    """Read a manual character table from the file `dest`.

    The file is split on whitespace. Each entry is a '|'-separated list of
    code tokens (leading and trailing '|' are ignored): the first token is
    the source, the remaining ones are its targets. Characters 2..6 of each
    token are parsed as a hexadecimal code point.

    Returns a dict mapping the source character to the list of target
    characters; a later entry for the same source replaces an earlier one.
    """
    char_table = {}
    for token in ReadFile( dest ).split():
        entry = token.strip( '|' )
        if not entry:
            continue
        pieces = entry.split( '|', 1 )
        from_char = unichr2( int( pieces[0][2:7], 16 ) )
        to_chars = []
        for code in pieces[1].split( '|' ):
            to_chars.append( unichr2( int( code[2:7], 16 ) ) )
        char_table[from_char] = to_chars
    return char_table
00135
def GetValidTable( src_table ):
    """Reduce a one-to-many table to one-to-one.

    Returns a new dict that maps every key of `src_table` to the first
    element of its value sequence.
    """
    return dict( ( source, targets[0] ) for source, targets in src_table.items() )
00141
def GetToManyRules( src_table ):
    """Collect the non-primary targets of a one-to-many table.

    Returns a dict whose keys are all elements found after the first
    position of any value sequence in `src_table`; every value is True.
    """
    tomany_table = {}
    for targets in src_table.values():
        for alternative in targets[1:]:
            tomany_table[alternative] = True
    return tomany_table
00148
def RemoveRules( dest, table ):
    """Delete from `table` the rules listed in the file `dest`.

    The file is split on whitespace; single and double quotes are dropped
    from every entry. Entry forms:

      'x'     -- remove the rule keyed 'x' and every rule whose value is 'x'
      '=>y'   -- remove every rule whose value is 'y'
      'x=>'   -- remove the rule keyed 'x'
      'x=>y'  -- remove the rule keyed 'x'

    NOTE(review): for the 'x=>y' form the rule keyed 'x' is removed whatever
    its current value is; 'y' is not compared -- confirm this is intended.

    `table` is modified in place and also returned. Entries naming rules
    that are not present are ignored.
    """
    text = ReadFile( dest )
    for elem in text.split():
        source = ''
        target = ''
        elem = elem.strip().replace( '"', '' ).replace( '\'', '' )
        if '=>' in elem:
            if elem.startswith( '=>' ):
                target = elem.replace( '=>', '' ).strip()
            elif elem.endswith( '=>' ):
                source = elem.replace( '=>', '' ).strip()
            else:
                # Explicit pair: only the key side is acted upon.
                source = elem.split( '=>' )[0].strip()
                table.pop( source, None )
                continue
        else:
            source = target = elem
        if source:
            # A default makes pop() a no-op for missing keys, so no
            # catch-all exception handler is needed.
            table.pop( source, None )
        if target:
            doomed = [key for key, value in table.items() if value == target]
            for key in doomed:
                table.pop( key )
    return table
00182
def DictToSortedList1( src_table ):
    """Return the (key, value) pairs of `src_table` as a list ordered by key."""
    pairs = list( src_table.items() )
    pairs.sort( key = lambda pair: pair[0] )
    return pairs
00185
def DictToSortedList2( src_table ):
    """Return the (key, value) pairs of `src_table` as a list ordered by value."""
    pairs = list( src_table.items() )
    pairs.sort( key = lambda pair: pair[1] )
    return pairs
00188
def Converter( string, conv_table ):
    """Apply `conv_table` to `string` by greedy longest-match substitution.

    The string is scanned from left to right. At each position the longest
    substring that has a truthy entry in `conv_table` is replaced by that
    entry, and scanning continues right after the inserted text, so
    replacement text is never converted again. Positions without a match
    are left unchanged. Returns the converted string.
    """
    pos = 0
    while pos < len( string ):
        tail = string[pos:]
        # Try the longest candidate first, shrinking down to one character.
        for width in range( len( tail ), 0, -1 ):
            replacement = conv_table.get( tail[:width] )
            if replacement:
                string = string[:pos] + replacement + tail[width:]
                # Land on the last inserted character; the increment below
                # then steps past the replacement.
                pos += len( replacement ) - 1
                break
        pos += 1
    return string
00201
def GetDefaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ):
    """Derive word-level rules that the character tables alone get wrong.

    src_wordlist      -- iterable of words; duplicates are ignored.
    src_tomany        -- mapping whose truthy entries flag characters that
                         need the extra round-trip check.
    char_conv_table   -- table used to convert a word.
    char_reconv_table -- table used for the reverse direction.

    Words are handled in groups of equal length, shortest first. For each
    group the working tables are the word rules found so far overlaid with
    the character tables. A word produces a rule when its converted form has
    no entry in the reverse table yet and either
      * applying the reverse table to the word itself changes it, or
      * the word contains a flagged character and converting it forth and
        back does not reproduce it.

    Returns a dict mapping each converted word to its original word.
    """
    wordlist = list( set( src_wordlist ) )
    wordlist.sort( key = len, reverse = True )
    word_conv_table = {}
    word_reconv_table = {}
    while wordlist:
        # Snapshot the tables once per length group; rules found inside the
        # group only take effect for longer words.
        conv_table = dict( word_conv_table )
        conv_table.update( char_conv_table )
        reconv_table = dict( word_reconv_table )
        reconv_table.update( char_reconv_table )
        # Peek at the next word's length rather than popping it, so that the
        # first word of every new length group is processed and not lost
        # when the tables are rebuilt.
        word_len = len( wordlist[-1] )
        while wordlist and len( wordlist[-1] ) == word_len:
            word = wordlist.pop()
            rvt_test = any( src_tomany.get( char ) for char in word )
            test_word = Converter( word, reconv_table )
            new_word = Converter( word, conv_table )
            if reconv_table.get( new_word ):
                continue
            if test_word != word:
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
            elif rvt_test:
                rvt_word = Converter( new_word, reconv_table )
                if rvt_word != word:
                    word_conv_table[word] = new_word
                    word_reconv_table[new_word] = word
    return word_reconv_table
00237
def GetManualWordsTable( src_wordlist, conv_table ):
    """Build reverse rules for a manually maintained word list.

    src_wordlist -- iterable of lines; everything from the first '#' on is
                    dropped and the rest is stripped of surrounding space.
    conv_table   -- table handed to Converter() for every word.

    Words are processed shortest first. Returns a dict mapping each
    converted word to its original; when several words convert to the same
    text the one processed last is kept.
    """
    cleaned = []
    for entry in src_wordlist:
        cleaned.append( entry.split( '#' )[0].strip() )
    longest_first = sorted( set( cleaned ), key = len, reverse = True )
    reconv_table = {}
    for word in reversed( longest_first ):
        converted = Converter( word, conv_table )
        reconv_table[converted] = word
    return reconv_table
00248
def CustomRules( dest ):
    """Read whitespace-separated 'from to' token pairs from the file `dest`.

    Returns a dict mapping every first token of a pair to the second one.
    An odd number of tokens raises IndexError.
    """
    tokens = ReadFile( dest ).split()
    rules = dict()
    index = 0
    while index < len( tokens ):
        rules[tokens[index]] = tokens[index + 1]
        index += 2
    return rules
00256
def GetPHPArray( table ):
    """Render (from, to) pairs as the body lines of a PHP array literal.

    table -- iterable of 2-tuples of strings.

    Pairs with an empty `from` or `to` are skipped. Each remaining pair
    becomes one line of the form 'from' => 'to', and the lines are joined
    with newlines. Backslashes and single quotes inside the values are
    escaped so they cannot terminate the PHP single-quoted string early.
    """
    def quote( value ):
        # Backslashes first, otherwise the ones added for quotes would be
        # doubled again.
        return value.replace( '\\', '\\\\' ).replace( '\'', '\\\'' )

    lines = ['\'%s\' => \'%s\',' % ( quote( f ), quote( t ) )
             for ( f, t ) in table if f and t]
    return '\n'.join( lines )
00261
def RemoveSameChar( src_table ):
    """Return a copy of `src_table` without the entries whose key equals their value."""
    return dict( ( source, target )
                 for source, target in src_table.items()
                 if source != target )
00268
def main():
    """Generate ZhConversion.php from downloaded data and the *.manual files.

    Steps, all relative to the current working directory:
      1. download Unihan.zip and the scim-tables, scim-pinyin and libtabe
         archives (files that already exist are not fetched again);
      2. extract the needed members and collect a traditional and a
         simplified word list from them;
      3. drop the words matched by the *_exclude.manual files;
      4. build the character tables from Unihan_Variants.txt and the
         *.manual overrides, then derive the word tables;
      5. write the six PHP arrays to ZhConversion.php;
      6. delete the extracted temporary files.
    """
    # --- 1. Download the source archives ---
    url = 'http://www.unicode.org/Public/UNIDATA/Unihan.zip'
    han_dest = 'Unihan.zip'
    GetFileFromURL( url, han_dest )

    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER )
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    GetFileFromURL( url, tbe_dest )

    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER )
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    GetFileFromURL( url, pyn_dest )

    url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER )
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    GetFileFromURL( url, lbt_dest )

    # --- 2. Extract the data files and collect the word lists ---
    GetFileFromUnihan( han_dest )

    t_wordlist = []
    s_wordlist = []

    # EZ-Big table -> traditional word list. Keep only the text after the
    # last tab of every line.
    src = 'scim-tables-%s/tables/zh/EZ-Big.txt.in' % SCIM_TABLES_VER
    dst = 'EZ.txt.in'
    GetFileFromTar( tbe_dest, src, dst )
    text = ReadFile( dst )
    text = text.split( 'BEGIN_TABLE' )[1].strip()
    text = text.split( 'END_TABLE' )[0].strip()
    text = re.sub( r'.*\t', '', text )
    text = RemoveOneCharConv( text )
    t_wordlist.extend( text.split() )

    # Wubi table -> simplified word list. Keep the field between the last
    # two tabs of every line.
    src = 'scim-tables-%s/tables/zh/Wubi.txt.in' % SCIM_TABLES_VER
    dst = 'Wubi.txt.in'
    GetFileFromTar( tbe_dest, src, dst )
    text = ReadFile( dst )
    text = text.split( 'BEGIN_TABLE' )[1].strip()
    text = text.split( 'END_TABLE' )[0].strip()
    text = re.sub( r'.*\t(.*?)\t\d*', r'\g<1>', text )
    text = RemoveOneCharConv( text )
    s_wordlist.extend( text.split() )

    # Ziranma table -> simplified word list (same layout as Wubi).
    src = 'scim-tables-%s/tables/zh/Ziranma.txt.in' % SCIM_TABLES_VER
    dst = 'Ziranma.txt.in'
    GetFileFromTar( tbe_dest, src, dst )
    text = ReadFile( dst )
    text = text.split( 'BEGIN_TABLE' )[1].strip()
    text = text.split( 'END_TABLE' )[0].strip()
    text = re.sub( r'.*\t(.*?)\t\d*', r'\g<1>', text )
    text = RemoveOneCharConv( text )
    s_wordlist.extend( text.split() )

    # scim-pinyin phrase library -> simplified word list. Cut every line at
    # the tab that precedes a number, then drop the first five rows.
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    dst = 'phrase_lib.txt'
    GetFileFromTar( pyn_dest, src, dst )
    text = ReadFile( dst )
    text = re.sub( r'(.*)\t\d\d*.*', r'\g<1>', text )
    text = RemoveRows( text, 5 )
    text = RemoveOneCharConv( text )
    s_wordlist.extend( text.split() )

    # libtabe tsi.src (Big5) -> traditional word list. Remove the '# '
    # markers and everything from the first ' <digit>' on each line.
    src = 'libtabe/tsi-src/tsi.src'
    dst = 'tsi.src'
    GetFileFromTar( lbt_dest, src, dst )
    text = ReadBIG5File( dst )
    text = re.sub( r' \d.*', '', text.replace( '# ', '' ) )
    text = RemoveOneCharConv( text )
    t_wordlist.extend( text.split() )

    # Remove duplicates.
    t_wordlist = list( set( t_wordlist ) )
    s_wordlist = list( set( s_wordlist ) )

    # --- 3. Apply the manual exclusion lists ---
    # Every entry of the exclude file is used as a regex fragment; any word
    # containing a match is dropped. The joined text ends with a newline so
    # that the final word can be matched (and removed) like all the others.
    text = ReadFile( 'simpphrases_exclude.manual' )
    temp = text.split()
    s_string = '\n'.join( s_wordlist ) + '\n'
    for elem in temp:
        s_string = re.sub( '.*%s.*\n' % elem, '', s_string )
    s_wordlist = [word for word in s_string.split( '\n' ) if word]

    text = ReadFile( 'tradphrases_exclude.manual' )
    temp = text.split()
    t_string = '\n'.join( t_wordlist ) + '\n'
    for elem in temp:
        t_string = re.sub( '.*%s.*\n' % elem, '', t_string )
    t_wordlist = [word for word in t_string.split( '\n' ) if word]

    # --- 4. Character tables ---
    ( t2s_code, s2t_code ) = ReadUnihanFile( 'Unihan_Variants.txt' )

    # One-to-many tables: Unihan data first, manual entries override it.
    t2s_1tomany = {}
    t2s_1tomany.update( GetDefaultTable( t2s_code ) )
    t2s_1tomany.update( GetManualTable( 'trad2simp.manual' ) )

    s2t_1tomany = {}
    s2t_1tomany.update( GetDefaultTable( s2t_code ) )
    s2t_1tomany.update( GetManualTable( 'simp2trad.manual' ) )

    # One-to-one tables (first target only) and the sets of extra targets.
    t2s_1to1 = GetValidTable( t2s_1tomany )
    s_tomany = GetToManyRules( t2s_1tomany )

    s2t_1to1 = GetValidTable( s2t_1tomany )
    t_tomany = GetToManyRules( s2t_1tomany )

    # Remove the rules listed in the *_noconvert.manual files.
    t2s_1to1 = RemoveRules( 'trad2simp_noconvert.manual', t2s_1to1 )
    s2t_1to1 = RemoveRules( 'simp2trad_noconvert.manual', s2t_1to1 )

    # Supplemented copies: used for building the word tables only, they are
    # not written to the output themselves.
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()

    t2s_1to1_supp.update( CustomRules( 'trad2simp_supp_set.manual' ) )

    s2t_1to1_supp.update( CustomRules( 'simp2trad_supp_set.manual' ) )

    # Word tables from the manually maintained phrase lists.
    text = ReadFile( 'simpphrases.manual' )
    s_wordlist_manual = text.split( '\n' )
    t2s_word2word_manual = GetManualWordsTable( s_wordlist_manual, s2t_1to1_supp )
    t2s_word2word_manual.update( CustomRules( 'toSimp.manual' ) )

    text = ReadFile( 'tradphrases.manual' )
    t_wordlist_manual = text.split( '\n' )
    s2t_word2word_manual = GetManualWordsTable( t_wordlist_manual, t2s_1to1_supp )
    s2t_word2word_manual.update( CustomRules( 'toTrad.manual' ) )

    # Word tables derived from the collected word lists; manual word rules
    # are applied on top afterwards.
    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update( s2t_word2word_manual )
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update( t2s_word2word_manual )
    t2s_word2word = GetDefaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp )

    t2s_word2word.update( t2s_word2word_manual )

    s2t_word2word = GetDefaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp )

    s2t_word2word.update( s2t_word2word_manual )

    # --- 5. Assemble the final tables and write the PHP file ---
    t2s_1to1 = RemoveSameChar( t2s_1to1 )
    s2t_1to1 = RemoveSameChar( s2t_1to1 )
    toHans = DictToSortedList1( t2s_1to1 ) + DictToSortedList2( t2s_word2word )

    toHant = DictToSortedList1( s2t_1to1 ) + DictToSortedList2( s2t_word2word )

    toCN = DictToSortedList2( CustomRules( 'toCN.manual' ) )

    toHK = DictToSortedList2( CustomRules( 'toHK.manual' ) )

    toSG = DictToSortedList2( CustomRules( 'toSG.manual' ) )

    toTW = DictToSortedList2( CustomRules( 'toTW.manual' ) )

    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in includes/zhtable/
 * Do not modify directly!
 */

$zh2Hant = array(\n'''
    php += GetPHPArray( toHant )
    php += '\n);\n\n$zh2Hans = array(\n'
    php += GetPHPArray( toHans )
    php += '\n);\n\n$zh2TW = array(\n'
    php += GetPHPArray( toTW )
    php += '\n);\n\n$zh2HK = array(\n'
    php += GetPHPArray( toHK )
    php += '\n);\n\n$zh2CN = array(\n'
    php += GetPHPArray( toCN )
    php += '\n);\n\n$zh2SG = array(\n'
    php += GetPHPArray( toSG )
    php += '\n);'

    f = uniopen( 'ZhConversion.php', 'w', encoding = 'utf8' )
    print ('Writing ZhConversion.php ... ')
    try:
        f.write( php )
    finally:
        # Close the output even when encoding or writing fails.
        f.close()

    # --- 6. Remove the extracted temporary files ---
    print ('Deleting temp files ... ')
    for temp_file in ( 'EZ.txt.in', 'phrase_lib.txt', 'tsi.src',
                       'Unihan_Variants.txt', 'Wubi.txt.in', 'Ziranma.txt.in' ):
        os.remove( temp_file )
00474
00475
# Run the generator only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()