1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/env python
=
return True
return True
=
=
=
=
=
=
=
=
=
=
, =
=
# if fc in exist or jc in exist:
# print(fc, jc)
# mulit.add(fc)
# mulit.add(jc)
# if is_cn(fc) and is_cn(jc):
# continue
# 只有 钟
# def cjk_detect(texts):
# count = defaultdict(int)
# for i in texts:
# i = ord(i)
# if i >= 0xAC00 and i <= 0xD7A3:
# count["ko"] += 1
# elif i >= 0x3040 and i <= 0x30FF:
# count["ja"] += 1
# elif i >= 0x4E00 and i <= 0x9FFF:
# count["zh"] += 1
#
# print(count)
# # # korean
# # if re.search("[\uac00-\ud7a3]", texts):
# # return "ko"
# # # japanese
# # if re.search("[\u3040-\u30ff]", texts):
# # return "ja"
# # # chinese
# # for i in [
# # "[\u4e00-\u9FFF]",
# # ]:
# # if re.search(i, texts):
# # return "zh"
# return None
#
#
# def test_cjk_detect():
# # Pure English
# assert cjk_detect("Is Obstruction an Impeachable Offense? History Says Yes") is None
# # Pure French
# assert (
# cjk_detect(
# "Damian Lillard a réussi un nouveau shoot de la victoire"
# " au buzzer à très longue distance"
# )
# is None
# )
# # Simplified Chinese
# assert (
# cjk_detect(
# "2009年,波音公司(Boeing)在查尔斯顿附近的新厂破土动工时,曾宣扬这里是最先进的制造中心"
# ",将制造一款世界上最先进的飞机。但在接下来的十年里,这家生产787梦想客机的工厂一直受到做"
# "工粗糙和监管不力的困扰,危及航空安全。"
# )
# == "zh"
# )
# # Traditional Chinese
# assert (
# cjk_detect("北查爾斯頓工廠的安全漏洞已經引起了航空公司和監管機構的密切關注。")
# == "zh"
# )
# # Japanese
# assert (
# cjk_detect("日産自動車は24日、2019年3月期の連結業績予想を下方修正した。")
# == "ja"
# )
# # Korean
# assert cjk_detect("투서로 뜨고 투서에 지나") == "ko"
# # Korean with a Chinese character
# assert (
# cjk_detect("北 외무성 간부 총살설 주민들 사이서 확산…하노이 회담 실패 때문")
# == "ko"
# )
#
#
# def print_incorrect_cases():
# texts = "투서로 뜨고 투서에 지나"
# print(texts, "expected: ja actual:", cjk_detect(texts))
# # Japanese
# texts = "日産自動車、営業益45%減 前期下方修正"
# print(texts, "expected: ja actual:", cjk_detect(texts))
# # Traditional Chinese with Japanese hiragana
# texts = "健康の油切 好吃の涼麵"
# print(texts, "expected: zh actual:", cjk_detect(texts))
# # Traditional Chinese with Japanese katakana punctuation
# texts = "鐵腕・都鐸王朝(五):文藝復興最懂穿搭的高富帥——亨利八世"
# print(texts, "expected: zh actual:", cjk_detect(texts))
#
#
# if __name__ == "__main__":
# # Correct cases
# # test_cjk_detect()
# # Incorrect cases
# print_incorrect_cases()