#!/usr/bin/python # -*- coding: utf-8 -*- from __future__ import print_function, unicode_literals __license__ = """ This file is part of GNU FreeFont. GNU FreeFont is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. GNU FreeFont is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with GNU FreeFont. If not, see . """ __author__ = "Emmanuel Vallois" __email__ = "vallois@polytech.unice.fr" __copyright__ = "Copyright 2011 Emmanuel Vallois" __date__ = "$Date$" __version__ = "$Revision$" __doc__ = """ Writes in the file named by the first argument an HTML page comprising a table for testing arabic characters, their behavior and consistency with presentation forms. Runs under normal Python, version 2.7 or above. 
Typical usage: arabic_test.py "Arabic test page.html" """ import sys from codecs import open from string import Template from io import StringIO from unicodedata import normalize, name, unidata_version, decomposition _module_missing_msg = """Please run generate_arabic_shaping.py to generate arabic_shaping.py""" try: from arabic_shaping import joining_type except: print( _module_missing_msg, file=sys.stderr) sys.exit( 1 ) if len(sys.argv) > 1: outfile = sys.argv[1] else: outfile = 'Arabic test page.html' sys.stdout = open(outfile, 'w', 'utf-8') def uniname(char): return name(char, new_names.get(char, "<reserved-{:04X}>".format(ord(char)))) def non_positional_name(char): return uniname(char).replace(' INITIAL','').replace(' FINAL','').replace(' MEDIAL','').replace(' ISOLATED','').replace(' FORM','') arabic_ranges = list(range(0x600, 0x61B + 1)) arabic_ranges.extend(range(0x61E, 0x6FF + 1)) arabic_ranges.extend(range(0x750, 0x77F + 1)) arabic_ranges.extend(range(0x8A0, 0x8B1 + 1)) arabic_ranges.extend(range(0x8E4, 0x8FF + 1)) arabic_ranges.extend(range(0xFB50, 0xFBC1 + 1)) arabic_ranges.extend(range(0xFBD3, 0xFD3F + 1)) arabic_ranges.extend(range(0xFD50, 0xFD8F + 1)) arabic_ranges.extend(range(0xFD92, 0xFDC7 + 1)) arabic_ranges.extend(range(0xFDF0, 0xFDFD + 1)) arabic_ranges.extend(range(0xFE70, 0xFE74 + 1)) arabic_ranges.extend(range(0xFE76, 0xFEFC + 1)) unicode61_new_ranges = [0x604, 0x8A0] unicode61_new_ranges.extend(range(0x8A2, 0x8AC + 1)) unicode61_new_ranges.extend(range(0x8E4, 0x8FE + 1)) unicode62_new_ranges = [0x605, 0x8A1] unicode62_new_ranges.extend(range(0x8AD, 0x8B1 + 1)) unicode62_new_ranges.append(0x8FF) new_names = {} new_names['\u0604'] = 'ARABIC SIGN SAMVAT' new_names['\u0605'] = 'ARABIC NUMBER MARK ABOVE' new_names['\u08A0'] = 'ARABIC LETTER BEH WITH SMALL V BELOW' new_names['\u08A1'] = 'ARABIC LETTER BEH WITH HAMZA ABOVE' new_names['\u08A2'] = 'ARABIC LETTER JEEM WITH TWO DOTS ABOVE' new_names['\u08A3'] = 'ARABIC LETTER TAH WITH TWO DOTS ABOVE' 
new_names.update({
    '\u08A4': 'ARABIC LETTER FEH WITH DOT BELOW AND THREE DOTS ABOVE',
    '\u08A5': 'ARABIC LETTER QAF WITH DOT BELOW',
    '\u08A6': 'ARABIC LETTER LAM WITH DOUBLE BAR',
    '\u08A7': 'ARABIC LETTER MEEM WITH THREE DOTS ABOVE',
    '\u08A8': 'ARABIC LETTER YEH WITH TWO DOTS BELOW AND HAMZA ABOVE',
    '\u08A9': 'ARABIC LETTER YEH WITH TWO DOTS BELOW AND DOT ABOVE',
    '\u08AA': 'ARABIC LETTER REH WITH LOOP',
    '\u08AB': 'ARABIC LETTER WAW WITH DOT WITHIN',
    '\u08AC': 'ARABIC LETTER ROHINGYA YEH',
    '\u08E4': 'ARABIC CURLY FATHA',
    '\u08E5': 'ARABIC CURLY DAMMA',
    '\u08E6': 'ARABIC CURLY KASRA',
    '\u08E7': 'ARABIC CURLY FATHATAN',
    '\u08E8': 'ARABIC CURLY DAMMATAN',
    '\u08E9': 'ARABIC CURLY KASRATAN',
    '\u08EA': 'ARABIC TONE ONE DOT ABOVE',
    '\u08EB': 'ARABIC TONE TWO DOTS ABOVE',
    '\u08EC': 'ARABIC TONE LOOP ABOVE',
    '\u08ED': 'ARABIC TONE ONE DOT BELOW',
    '\u08EE': 'ARABIC TONE TWO DOTS BELOW',
    '\u08EF': 'ARABIC TONE LOOP BELOW',
    '\u08F0': 'ARABIC OPEN FATHATAN',
    '\u08F1': 'ARABIC OPEN DAMMATAN',
    '\u08F2': 'ARABIC OPEN KASRATAN',
    '\u08F3': 'ARABIC SMALL HIGH WAW',
    '\u08F4': 'ARABIC FATHA WITH RING',
    '\u08F5': 'ARABIC FATHA WITH DOT ABOVE',
    '\u08F6': 'ARABIC KASRA WITH DOT BELOW',
    '\u08F7': 'ARABIC LEFT ARROWHEAD ABOVE',
    '\u08F8': 'ARABIC RIGHT ARROWHEAD ABOVE',
    '\u08F9': 'ARABIC LEFT ARROWHEAD BELOW',
    '\u08FA': 'ARABIC RIGHT ARROWHEAD BELOW',
    '\u08FB': 'ARABIC DOUBLE RIGHT ARROWHEAD ABOVE',
    '\u08FC': 'ARABIC DOUBLE RIGHT ARROWHEAD ABOVE WITH DOT',
    '\u08FD': 'ARABIC RIGHT ARROWHEAD ABOVE WITH DOT',
    '\u08FE': 'ARABIC DAMMA WITH DOT',
    '\u08AD': 'ARABIC LETTER LOW ALEF',
    '\u08AE': 'ARABIC LETTER DAL WITH THREE DOTS BELOW',
    '\u08AF': 'ARABIC LETTER SAD WITH THREE DOTS BELOW',
    '\u08B0': 'ARABIC LETTER GAF WITH INVERTED STROKE',
    '\u08B1': 'ARABIC LETTER STRAIGHT WAW',
    '\u08FF': 'ARABIC MARK SIDEWAYS NOON GHUNNA',
    # Unicode 6.0 additions not present in Python 2.7
    '\u0620': 'ARABIC LETTER KASHMIRI YEH',
    '\u065F': 'ARABIC WAVY HAMZA BELOW',
    '\uFBB2': 'ARABIC SYMBOL DOT ABOVE',
    '\uFBB3': 'ARABIC SYMBOL DOT BELOW',
    '\uFBB4': 'ARABIC SYMBOL TWO DOTS ABOVE',
    '\uFBB5': 'ARABIC SYMBOL TWO DOTS BELOW',
    '\uFBB6': 'ARABIC SYMBOL THREE DOTS ABOVE',
    '\uFBB7': 'ARABIC SYMBOL THREE DOTS BELOW',
    '\uFBB8': 'ARABIC SYMBOL THREE DOTS POINTING DOWNWARDS ABOVE',
    '\uFBB9': 'ARABIC SYMBOL THREE DOTS POINTING DOWNWARDS BELOW',
    '\uFBBA': 'ARABIC SYMBOL FOUR DOTS ABOVE',
    '\uFBBB': 'ARABIC SYMBOL FOUR DOTS BELOW',
    '\uFBBC': 'ARABIC SYMBOL DOUBLE VERTICAL BAR BELOW',
    '\uFBBD': 'ARABIC SYMBOL TWO DOTS VERTICALLY ABOVE',
    '\uFBBE': 'ARABIC SYMBOL TWO DOTS VERTICALLY BELOW',
    '\uFBBF': 'ARABIC SYMBOL RING',
    '\uFBC0': 'ARABIC SYMBOL SMALL TAH ABOVE',
    '\uFBC1': 'ARABIC SYMBOL SMALL TAH BELOW',
})


class Equiv:
    '''Class Equiv stores the correspondence between a code point and its
    NFKC-normalized equivalent, for usual characters it is the character
    itself, for decomposable characters it is the compatibility
    decomposition.'''
    # Class-level defaults; every instance overwrites both in __init__.
    code_point = 0
    compat = 0

    def __init__(self, code_point, compat):
        '''Store the integer code point and its NFKC-normalized string.'''
        self.code_point = code_point
        self.compat = compat

    def sort_key(self):
        '''Return a key ordering by length of the decomposition (leading
        spaces ignored, as two hex digits), then by the decomposition.'''
        stripped = self.compat.lstrip(' ')
        return '{:02X}'.format(len(stripped)) + stripped

    def __repr__(self):
        return 'Equiv(0x{:04X}, compat={})'.format(self.code_point, self.compat)


# One Equiv per tested code point, in the order of arabic_ranges.
equivs = []
for cp in arabic_ranges:
    normalized = normalize('NFKC', unichr(cp))
    equivs.append(Equiv(cp, normalized))
# Sort our characters
# by length of the decomposition and by decomposition itself
equivs.sort(key=Equiv.sort_key)

# NOTE(review): this copy of the file has lost its original line breaks and
# indentation, and the string literals below look as if their HTML markup is
# missing (several format strings are empty while still being given
# arguments). The nesting was reconstructed from the syntax and the literals
# are kept exactly as found -- compare with a pristine copy before relying on
# the generated page.

#for e in equivs:
#	print(e, file=sys.stderr)

# Display format per contextual form. The keys are the values produced by
# decomposition(char)[1:7] in store_contextual_form().
# NOTE(review): the non-isolated formats appear to wrap the placeholder in
# invisible joiner characters (presumably U+200D ZERO WIDTH JOINER) -- take
# care not to strip them when editing.
contextual_form_formats = { 'isolat':'{}', 'final>':'‍{}', 'medial':'‍{}‍', 'initia':'{}‍' }
# Order in which the forms are emitted for each table row.
contextual_forms = 'isolat', 'final>', 'medial', 'initia'
# Cells collected for the row being built, keyed by contextual form.
current_line = {}
# Shared state: print_table() sets both, store_contextual_form() reads them.
equiv = None
char = None

def store_contextual_form():
    """Render the cells for the module-level `char`/`equiv` pair and store
    them in `current_line` under the character's contextual form."""
    # print('store_contextual_form', equiv, file=sys.stderr)
    compat_disp = equiv.compat
    # A leading space in the decomposition is displayed as U+00A0 instead.
    if equiv.compat[0] == ' ':
        compat_disp = '\u00A0' + compat_disp[1:]
    #nonlocal current_line
    form_cells = StringIO()
    # Characters 1..6 of the decomposition string select the format key;
    # unknown keys fall back to the plain '{}' format below.
    form = decomposition(char)[1:7]
    # The code-point list is appended only for decompositions of two or more
    # characters.
    print('{}{}'.format(contextual_form_formats.get(form, '{}').format(compat_disp), '\n{}\n'.format(ord_mul(compat_disp)) if len(compat_disp) >=2 else ''), file=form_cells)
    print('{}\n{:04X}\n'.format(char, equiv.code_point), file=form_cells)
    #if current_line.get(form, 'not found') != 'not found': print('collision', current_line[form].rstrip(), equiv, file=stderr)
    current_line[form] = form_cells.getvalue()
    form_cells.close()

# Formatted with `caption` at the start of print_table().
table_head = ''' {} '''

def print_table():
    """Print the table: the head, then the output of print_equiv() for each
    group of entries in `equivs`, then the closing text."""
    global current_line, char
    def end_line():
        """Print the stored cells of the current row in `contextual_forms`
        order (empty string for a missing form), then reset the row."""
        for form in contextual_forms:
            print(current_line.get(form, '').rstrip())
        print('')
        current_line.clear()
    def print_equiv(equiv):
        """Print the row for `equiv`: its leading cells, then either its own
        contextual forms or the forms stored in `current_line`."""
        # print('print_equiv', equiv, file=sys.stderr)
        cp = equiv.code_point
        # Local to print_equiv: the `global char` above does not apply here.
        char = unichr(cp)
        print(''.format(' class="nextVersion"' if cp in unicode61_new_ranges else ' class="furtherFuture"' if cp in unicode62_new_ranges else '', 'compat' if len(equiv.compat.replace(' ', '')) > 1 else '{:04X}'.format(ord(equiv.compat.lstrip()[0]))))
        print(''.format(non_positional_name(char)))
        if equiv.compat.replace(' ', '') == char:
            # character is not a decomposable character, or is a standalone combining mark (decomposable to space + combining mark)
            i = 0
            for form in contextual_forms:
                print(''.format(contextual_form_formats[form].format(char)))
                i += 1
                # Stop after the last form mapped to this joining type; 'D'
                # maps to '' which never matches, so all four are printed.
                if { 'T':'isolat', 'U':'isolat', 'C':'isolat', 'R':'final>', 'D':'' }[joining_type(cp)] == form:
                    break
            if i < 4:
                # Two columns for every form that was not printed.
                print(''.format((4 - i) * 2))
            print('')
        else:
            end_line()
    print(table_head.format(caption))
    last_equiv = None
    global equiv
    for equiv in equivs:
        char = unichr(equiv.code_point)
        if last_equiv:
            #special case FC03 because there is one set of plain YEH WITH HAMZA ABOVE WITH ALEF MAKSURA and one of 'uighur kirghiz' compatibility ligatures
            if equiv.compat.lstrip() == last_equiv.compat.lstrip() and equiv.code_point != 0xFC03:
                store_contextual_form()
            else:
                print_equiv(last_equiv)
                # NOTE(review): the placement of this test inside the else
                # branch is reconstructed -- confirm.
                if equiv.compat != char:
                    store_contextual_form()
        last_equiv = equiv
    # Flush the final group.
    print_equiv(last_equiv)
    print('\nGeneral\nUnicode\nName Contextual Forms\nIsolatedIsolated (compat)FinalFinal (compat) MedialMedial (compat)InitialInitial (compat)\n{}{}{}\n')

def ord_mul(s):
    """Return the code points of the characters of `s` as uppercase
    hexadecimal numbers separated by single spaces."""
    code_points = ''
    for c in s:
        code_points += '{:X} '.format(ord(c))
    # Drop the trailing space.
    return code_points[:-1]

# Page header; $title is substituted in print_arabic_test_page().
html_heading = Template(''' $title

$title

Choose the font to test:

''')

# Legend passed to table_head.
caption='''  New characters in Unicode 6.1, which will be published in February 2012. These can be relied upon and will not change or be removed. See the Unicode chart for the new block Arabic Extended-A, and for more about these characters, see N3734 for U+0604, the complete proposal for most characters, N3791 for U+08F0-U+08F3.
  Future new characters in Unicode 6.2. These can will probably be standardized this way, but could in principle still change or be removed. See N3990, in 4.2 Orthography for U+0605, N4072 proposal about U+08AD-U+08B1, and N3989 proposal about U+08FF.'''

def print_arabic_test_page():
    """Print the whole page: heading, table, and two trailing lines."""
    print(html_heading.substitute(title='Test for Unicode Arabic range'))
    print_table()
    print('')
    print('')

print_arabic_test_page()