123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361 |
- # unicode.py
- import sys
- from itertools import filterfalse
- from typing import List, Tuple, Union
class _lazyclassproperty:
    """
    Descriptor that computes a class-level value once and memoizes it per class.

    The wrapped function is called with the owning class as its only argument.
    Results are stored in a ``_intern`` dict kept on that class and keyed by the
    wrapped function's name. A class never shares this dict with one of its
    superclasses: when the ``_intern`` visible on the class is the same object
    a superclass exposes, a fresh dict is installed on the class first.
    """

    def __init__(self, fn):
        self.fn = fn
        # Mirror the wrapped function's metadata onto the descriptor.
        self.__doc__ = fn.__doc__
        self.__name__ = fn.__name__

    def __get__(self, obj, cls):
        """Return the memoized value for the accessing class, computing it on first use."""
        owner = type(obj) if cls is None else cls

        # A private cache is required when the class has none at all, or when
        # the one it sees is merely inherited from a superclass.
        needs_own_cache = not hasattr(owner, "_intern")
        if not needs_own_cache:
            needs_own_cache = any(
                owner._intern is getattr(base, "_intern", [])
                for base in owner.__mro__[1:]
            )
        if needs_own_cache:
            owner._intern = {}

        key = self.fn.__name__
        if key in owner._intern:
            return owner._intern[key]

        value = self.fn(owner)
        owner._intern[key] = value
        return value
# Type alias used to annotate ``_ranges`` tables: a list whose entries are
# either a 2-tuple of ints or a 1-tuple holding a single int.
UnicodeRangeList = List[Union[Tuple[int, int], Tuple[int]]]
class unicode_set:
    """
    Base class describing a set of Unicode characters, from which the
    language-specific strings ``alphas``, ``nums``, ``alphanums``, and
    ``printables`` are derived.

    A subclass declares the code points it covers in the class attribute
    ``_ranges``, a list whose entries are 2-tuples or 1-tuples, such as::

        _ranges = [
            (0x0020, 0x007e),
            (0x00a0, 0x00ff),
            (0x0100,),
        ]

    Both ends of a range are inclusive, and a 1-tuple ``(x,)`` is handled the
    same as ``(x, x)``.

    Several unicode sets can be merged into one through multiple inheritance::

        class CJK(Chinese, Japanese, Korean):
            pass
    """

    _ranges: UnicodeRangeList = []

    @_lazyclassproperty
    def _chars_for_ranges(cls):
        """sorted list of the distinct characters covered by this class and its bases"""
        code_points = set()
        for klass in cls.__mro__:
            if klass is unicode_set:
                # Everything from the common base class onward adds no ranges.
                break
            for span in getattr(klass, "_ranges", ()):
                # span[-1] is span[0] for a 1-tuple, so one expression serves both forms.
                code_points.update(range(span[0], span[-1] + 1))
        return list(map(chr, sorted(code_points)))

    @_lazyclassproperty
    def printables(cls):
        """every character in this range that is not whitespace"""
        return "".join(ch for ch in cls._chars_for_ranges if not ch.isspace())

    @_lazyclassproperty
    def alphas(cls):
        """every alphabetic character in this range"""
        return "".join(ch for ch in cls._chars_for_ranges if ch.isalpha())

    @_lazyclassproperty
    def nums(cls):
        """every numeric digit character in this range"""
        return "".join(ch for ch in cls._chars_for_ranges if ch.isdigit())

    @_lazyclassproperty
    def alphanums(cls):
        """every alphanumeric character in this range: ``alphas`` followed by ``nums``"""
        letters = cls.alphas
        digits = cls.nums
        return letters + digits

    @_lazyclassproperty
    def identchars(cls):
        """
        sorted string of the characters in this range that are valid identifiers
        on their own, together with a fixed set of unaccented and accented Latin
        letters and underscore '_'
        """
        always_included = (
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
            "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
            "_"
        )
        from_range = {ch for ch in cls._chars_for_ranges if ch.isidentifier()}
        return "".join(sorted(from_range | set(always_included)))

    @_lazyclassproperty
    def identbodychars(cls):
        """
        sorted string of the characters in this range that are valid after the
        first character of an identifier, together with everything in
        ``identchars``, the digits 0-9, and · (Unicode MIDDLE DOT)
        """
        leading = set(cls.identchars)
        # Prefixing "_" tests whether a character is legal in non-initial position.
        trailing = {ch for ch in cls._chars_for_ranges if ("_" + ch).isidentifier()}
        return "".join(sorted(leading | set("0123456789·") | trailing))

    @_lazyclassproperty
    def identifier(cls):
        """
        a pyparsing Word expression matching an identifier, built from this
        range's identchars (leading characters) and identbodychars (the rest)
        """
        from pyparsing import Word

        first_chars = cls.identchars
        body_chars = cls.identbodychars
        return Word(first_chars, body_chars)
class pyparsing_unicode(unicode_set):
    """
    A namespace class for defining common language unicode_sets.

    Every nested class is a ``unicode_set`` whose ``_ranges`` table lists the
    code points it covers; a 1-tuple entry stands for a single code point.
    The namespace's own ``_ranges`` runs from 0x0020 through ``sys.maxunicode``.
    """

    # fmt: off

    # define ranges in language character sets
    _ranges: UnicodeRangeList = [
        (0x0020, sys.maxunicode),
    ]

    class BasicMultilingualPlane(unicode_set):
        """Unicode set for the Basic Multilingual Plane"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0xFFFF),
        ]

    class Latin1(unicode_set):
        """Unicode set for Latin-1 Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0x007E),
            (0x00A0, 0x00FF),
        ]

    class LatinA(unicode_set):
        """Unicode set for Latin-A Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0100, 0x017F),
        ]

    class LatinB(unicode_set):
        """Unicode set for Latin-B Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0180, 0x024F),
        ]

    class Greek(unicode_set):
        """Unicode set for Greek Unicode Character Ranges"""
        _ranges: UnicodeRangeList = [
            (0x0342, 0x0345),
            (0x0370, 0x0377),
            (0x037A, 0x037F),
            (0x0384, 0x038A),
            (0x038C,),
            (0x038E, 0x03A1),
            (0x03A3, 0x03E1),
            (0x03F0, 0x03FF),
            (0x1D26, 0x1D2A),
            (0x1D5E,),
            (0x1D60,),
            (0x1D66, 0x1D6A),
            (0x1F00, 0x1F15),
            (0x1F18, 0x1F1D),
            (0x1F20, 0x1F45),
            (0x1F48, 0x1F4D),
            (0x1F50, 0x1F57),
            (0x1F59,),
            (0x1F5B,),
            (0x1F5D,),
            (0x1F5F, 0x1F7D),
            (0x1F80, 0x1FB4),
            (0x1FB6, 0x1FC4),
            (0x1FC6, 0x1FD3),
            (0x1FD6, 0x1FDB),
            (0x1FDD, 0x1FEF),
            (0x1FF2, 0x1FF4),
            (0x1FF6, 0x1FFE),
            (0x2129,),
            (0x2719, 0x271A),
            (0xAB65,),
            (0x10140, 0x1018D),
            (0x101A0,),
            (0x1D200, 0x1D245),
            (0x1F7A1, 0x1F7A7),
        ]

    class Cyrillic(unicode_set):
        """Unicode set for Cyrillic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0400, 0x052F),
            (0x1C80, 0x1C88),
            (0x1D2B,),
            (0x1D78,),
            (0x2DE0, 0x2DFF),
            (0xA640, 0xA672),
            (0xA674, 0xA69F),
            (0xFE2E, 0xFE2F),
        ]

    class Chinese(unicode_set):
        """Unicode set for Chinese Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x2E80, 0x2E99),
            (0x2E9B, 0x2EF3),
            (0x31C0, 0x31E3),
            (0x3400, 0x4DB5),
            (0x4E00, 0x9FEF),
            (0xA700, 0xA707),
            (0xF900, 0xFA6D),
            (0xFA70, 0xFAD9),
            (0x16FE2, 0x16FE3),
            (0x1F210, 0x1F212),
            (0x1F214, 0x1F23B),
            (0x1F240, 0x1F248),
            (0x20000, 0x2A6D6),
            (0x2A700, 0x2B734),
            (0x2B740, 0x2B81D),
            (0x2B820, 0x2CEA1),
            (0x2CEB0, 0x2EBE0),
            (0x2F800, 0x2FA1D),
        ]

    class Japanese(unicode_set):
        """Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"""

        class Kanji(unicode_set):
            """Unicode set for Kanji Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x4E00, 0x9FBF),
                (0x3000, 0x303F),
            ]

        class Hiragana(unicode_set):
            """Unicode set for Hiragana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3041, 0x3096),
                (0x3099, 0x30A0),
                (0x30FC,),
                (0xFF70,),
                (0x1B001,),
                (0x1B150, 0x1B152),
                (0x1F200,),
            ]

        class Katakana(unicode_set):
            """Unicode set for Katakana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3099, 0x309C),
                (0x30A0, 0x30FF),
                (0x31F0, 0x31FF),
                (0x32D0, 0x32FE),
                (0xFF65, 0xFF9F),
                (0x1B000,),
                (0x1B164, 0x1B167),
                (0x1F201, 0x1F202),
                (0x1F213,),
            ]

        # native-script aliases for the three nested sets
        漢字 = Kanji
        カタカナ = Katakana
        ひらがな = Hiragana

        # Japanese itself covers the concatenation of the three nested tables
        _ranges = (
            Kanji._ranges
            + Hiragana._ranges
            + Katakana._ranges
        )

    class Hangul(unicode_set):
        """Unicode set for Hangul (Korean) Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x1100, 0x11FF),
            (0x302E, 0x302F),
            (0x3131, 0x318E),
            (0x3200, 0x321C),
            (0x3260, 0x327B),
            (0x327E,),
            (0xA960, 0xA97C),
            (0xAC00, 0xD7A3),
            (0xD7B0, 0xD7C6),
            (0xD7CB, 0xD7FB),
            (0xFFA0, 0xFFBE),
            (0xFFC2, 0xFFC7),
            (0xFFCA, 0xFFCF),
            (0xFFD2, 0xFFD7),
            (0xFFDA, 0xFFDC),
        ]

    # Korean is a second name for the same class object as Hangul
    Korean = Hangul

    class CJK(Chinese, Japanese, Hangul):
        """
        Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range

        Declares no ``_ranges`` of its own; its only content comes from the
        three base classes it inherits from.
        """

    class Thai(unicode_set):
        """Unicode set for Thai Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0E01, 0x0E3A),
            (0x0E3F, 0x0E5B)
        ]

    class Arabic(unicode_set):
        """Unicode set for Arabic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0600, 0x061B),
            (0x061E, 0x06FF),
            (0x0700, 0x077F),
        ]

    class Hebrew(unicode_set):
        """Unicode set for Hebrew Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0591, 0x05C7),
            (0x05D0, 0x05EA),
            (0x05EF, 0x05F4),
            (0xFB1D, 0xFB36),
            (0xFB38, 0xFB3C),
            (0xFB3E,),
            (0xFB40, 0xFB41),
            (0xFB43, 0xFB44),
            (0xFB46, 0xFB4F),
        ]

    class Devanagari(unicode_set):
        """Unicode set for Devanagari Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0900, 0x097F),
            (0xA8E0, 0xA8FF)
        ]

    # short alias for BasicMultilingualPlane
    BMP = BasicMultilingualPlane

    # add language identifiers using language Unicode
    # (each name below is bound to the same class object as its right-hand side)
    العربية = Arabic
    中文 = Chinese
    кириллица = Cyrillic
    Ελληνικά = Greek
    עִברִית = Hebrew
    日本語 = Japanese
    한국어 = Korean
    ไทย = Thai
    देवनागरी = Devanagari

    # fmt: on
|