180 lines
5.3 KiB
Modula-2
180 lines
5.3 KiB
Modula-2
#
|
||
# Japanese character category map
|
||
#
|
||
# $Id: char.def 9 2012-12-12 04:13:15Z togiso $;
|
||
#
|
||
|
||
###################################################################################
|
||
#
|
||
# CHARACTER CATEGORY DEFINITION
|
||
#
|
||
# CATEGORY_NAME INVOKE GROUP LENGTH
|
||
#
|
||
# - CATEGORY_NAME: Name of category. You have to define the DEFAULT class.
|
||
# - INVOKE: 1/0: always invoke unknown word processing, even when the word can be found in the lexicon
|
||
# - GROUP: 1/0: make a new word by grouping the same character category
|
||
# - LENGTH: n: new words of length 1 to n are added
|
||
#
|
||
DEFAULT 0 1 0 # DEFAULT is a mandatory category!
|
||
SPACE 0 1 0
|
||
KANJI 0 0 2
|
||
SYMBOL 1 1 0
|
||
NUMERIC 1 1 0
|
||
ALPHA 1 1 0
|
||
HIRAGANA 0 1 2
|
||
KATAKANA 1 1 2
|
||
KANJINUMERIC 0 1 0 #change INVOKE 1->0
|
||
GREEK 1 1 0
|
||
CYRILLIC 1 1 0
|
||
|
||
###################################################################################
|
||
#
|
||
# CODE(UCS2) TO CATEGORY MAPPING
|
||
#
|
||
|
||
# SPACE
|
||
0x0020 SPACE # DO NOT REMOVE THIS LINE, 0x0020 is reserved for SPACE
|
||
0x000D SPACE
|
||
0x0009 SPACE
|
||
0x000B SPACE
|
||
0x000A SPACE
|
||
|
||
# ASCII
|
||
0x0021..0x002F SYMBOL #!"#$%&'()*+,-./
|
||
0x0030..0x0039 NUMERIC #0-9
|
||
0x003A..0x0040 SYMBOL #:;<=>?@
|
||
0x0041..0x005A ALPHA #A-Z
|
||
0x005B..0x0060 SYMBOL #[\]^_`
|
||
0x0061..0x007A ALPHA #a-z
|
||
0x007B..0x007E SYMBOL #{|}~
|
||
|
||
# Latin
|
||
0x00A1..0x00BF SYMBOL # Latin 1 #¡->¿
|
||
0x00C0..0x00D6 ALPHA # Latin 1 #À->Ö
|
||
0x00D7 SYMBOL # Latin 1 #×
|
||
0x00D8..0x00F6 ALPHA # Latin 1 #Ø->ö
|
||
0x00F7 SYMBOL # Latin 1 #÷
|
||
0x00F8..0x00FF ALPHA # Latin 1 #ø->ÿ
|
||
0x0100..0x017F ALPHA # Latin Extended A
|
||
0x0180..0x0236 ALPHA # Latin Extended B
|
||
0x1E00..0x1EF9 ALPHA # Latin Extended Additional
|
||
|
||
# CYRILLIC
|
||
0x0400..0x04F9 CYRILLIC #Ѐ->ӹ
|
||
0x0500..0x050F CYRILLIC # Cyrillic supplementary
|
||
|
||
# GREEK
|
||
0x0374..0x03FB GREEK # Greek and Coptic #ʹ->ϻ
|
||
|
||
# HIRAGANA
|
||
0x3041..0x309F HIRAGANA
|
||
|
||
# KATAKANA
|
||
#0x30A1..0x30FF KATAKANA
|
||
0x30A1..0x30FA KATAKANA
|
||
0x30FC..0x30FF KATAKANA
|
||
0x31F0..0x31FF KATAKANA # Small KU .. Small RO
|
||
# 0x30FC KATAKANA HIRAGANA # ー
|
||
0x30A1 NOOOVBOW # Small A
|
||
0x30A3 NOOOVBOW # ...
|
||
0x30A5 NOOOVBOW
|
||
0x30A7 NOOOVBOW
|
||
0x30A9 NOOOVBOW
|
||
0x30E3 NOOOVBOW
|
||
0x30E5 NOOOVBOW
|
||
0x30E7 NOOOVBOW
|
||
0x30EE NOOOVBOW # Small Wa
|
||
0x30FC..0x30FE NOOOVBOW # 'ー' 'ヽ' 'ヾ'
|
||
|
||
# Half KATAKANA
|
||
0xFF66..0xFF9D KATAKANA
|
||
0xFF9E..0xFF9F KATAKANA
|
||
|
||
# KANJI
|
||
0x2E80..0x2EF3 KANJI # CJK Radicals Supplement
|
||
0x2F00..0x2FD5 KANJI
|
||
0x3005 KANJI NOOOVBOW
|
||
0x3007 KANJI
|
||
0x3400..0x4DB5 KANJI # CJK Unified Ideographs Extension
|
||
#0x4E00..0x9FA5 KANJI
|
||
0x4E00..0x9FFF KANJI
|
||
0xF900..0xFA2D KANJI
|
||
0xFA30..0xFA6A KANJI
|
||
|
||
|
||
# KANJI-NUMERIC (一 二 三 四 五 六 七 八 九 十 百 千 万 億 兆)
|
||
0x4E00 KANJINUMERIC KANJI
|
||
0x4E8C KANJINUMERIC KANJI
|
||
0x4E09 KANJINUMERIC KANJI
|
||
0x56DB KANJINUMERIC KANJI
|
||
0x4E94 KANJINUMERIC KANJI
|
||
0x516D KANJINUMERIC KANJI
|
||
0x4E03 KANJINUMERIC KANJI
|
||
0x516B KANJINUMERIC KANJI
|
||
0x4E5D KANJINUMERIC KANJI
|
||
0x5341 KANJINUMERIC KANJI
|
||
0x767E KANJINUMERIC KANJI
|
||
0x5343 KANJINUMERIC KANJI
|
||
0x4E07 KANJINUMERIC KANJI
|
||
0x5104 KANJINUMERIC KANJI
|
||
0x5146 KANJINUMERIC KANJI
|
||
|
||
# ZENKAKU
|
||
0xFF10..0xFF19 NUMERIC
|
||
0xFF21..0xFF3A ALPHA
|
||
0xFF41..0xFF5A ALPHA
|
||
0xFF01..0xFF0F SYMBOL #!->/
|
||
0xFF1A..0xFF20 SYMBOL #:->@
|
||
0xFF3B..0xFF40 SYMBOL #[->`
|
||
0xFF5B..0xFF65 SYMBOL #{->・
|
||
0xFFE0..0xFFEF SYMBOL # Halfwidth and Fullwidth Forms
|
||
|
||
# OTHER SYMBOLS
|
||
0x2000..0x206F SYMBOL # General Punctuation
|
||
0x2070..0x209F NUMERIC # Superscripts and Subscripts
|
||
0x20A0..0x20CF SYMBOL # Currency Symbols
|
||
0x2100..0x214F SYMBOL # Letterlike Symbols
|
||
0x2150..0x218F NUMERIC # Number forms
|
||
0x2100..0x214B SYMBOL # Letterlike Symbols
|
||
0x2190..0x21FF SYMBOL # Arrows
|
||
0x2200..0x22FF SYMBOL # Mathematical Operators
|
||
0x2300..0x23FF SYMBOL # Miscellaneous Technical
|
||
0x2460..0x24FF SYMBOL # Enclosed Alphanumerics
|
||
0x2501..0x257F SYMBOL # Box Drawing
|
||
0x2580..0x259F SYMBOL # Block Elements
|
||
0x25A0..0x25FF SYMBOL # Geometric Shapes
|
||
0x2600..0x26FE SYMBOL # Miscellaneous Symbols
|
||
0x2700..0x27BF SYMBOL # Dingbats
|
||
0x27F0..0x27FF SYMBOL # Supplemental Arrows A
|
||
0x27C0..0x27EF SYMBOL # Miscellaneous Mathematical Symbols-A
|
||
0x2800..0x28FF SYMBOL # Braille Patterns
|
||
0x2900..0x297F SYMBOL # Supplemental Arrows B
|
||
0x2B00..0x2BFF SYMBOL # Miscellaneous Symbols and Arrows
|
||
0x2A00..0x2AFF SYMBOL # Supplemental Mathematical Operators
|
||
0x3300..0x33FF SYMBOL
|
||
0x3200..0x32FE SYMBOL # Enclosed CJK Letters and Months
|
||
0x3000..0x303F SYMBOL # CJK Symbol and Punctuation
|
||
0xFE30..0xFE4F SYMBOL # CJK Compatibility Forms
|
||
0xFE50..0xFE6B SYMBOL # Small Form Variants
|
||
|
||
# added 2006/3/13
|
||
0x3007 SYMBOL KANJINUMERIC
|
||
|
||
# added 2018/11/30
|
||
0x309b..0x309c HIRAGANA KATAKANA # voiced/semi-voiced sound marks
|
||
|
||
# Unicode combining symbols 2021/12/03
|
||
# from https://en.wikipedia.org/wiki/Combining_character
|
||
0x0300..0x036F ALL NOOOVBOW # Combining Diacritical Marks
|
||
0x1AB0..0x1AFF ALL NOOOVBOW # Combining Diacritical Marks Extended
|
||
0x1DC0..0x1DFF ALL NOOOVBOW # Combining Diacritical Marks Supplement
|
||
0x20D0..0x20FF ALL NOOOVBOW # Combining Diacritical Marks for Symbols
|
||
0xFE20..0xFE2F ALL NOOOVBOW # Combining Half Marks
|
||
0xFE00..0xFE0F ALL NOOOVBOW # https://codepoints.net/variation_selectors
|
||
0x1F3FB..0x1F3FE ALL NOOOVBOW # emoji skin tone modifiers https://codepoints.net/U+1F3FF
|
||
|
||
# Combination marks
|
||
0x200C..0x200D ALL NOOOVBOW2 # Zero Width Non-Joiner/Joiner
|
||
|
||
# END OF TABLE
|