from prelude import *
from botaccount import *
from data.accent import *
"deaudio.py"
zh = mwc.Site('zh.wiktionary.org', clients_useragent = UA)
en = mwc.Site('en.wiktionary.org', clients_useragent = UA)
de = mwc.Site('de.wiktionary.org', clients_useragent = UA)
zh.login(UN, PWD)
en.login(UN, PWD)
de.login(UN, PWD)
zpgl = list(zh.Pages['Template:IPA'].embed(namespace = 0))
pn = list(map(lambda p: p.name, zpgl))
pn_ = [x.replace(' ', '_') for x in pn]
n = len(pn)
defaultsummary = '[[User:CrowleyBot/task/21|根据commons文件列表增加德语音频]]'  # edit summary: "add German audio based on the Commons file list"
ztxtl = [x.text() for x in zpgl]
ztxts = [x.split('\n') for x in ztxtl]
for i in range(n):
wtel()
skiptemp = {'Template:Adjektiv-Tabelle', 'Template:De-clothes-words', 'Template:Deklinationsseite Adjektiv', 'Template:Deutsch Substantiv m stark', 'Template:Substantiv-Tabelle', 'Template:Verb-Tabelle', 'Template:Da-nm', 'Template:De', 'Template:De-aj', 'Template:De-aj-dk', 'Template:De-av', 'Template:De-cat', 'Template:De-decl-noun', 'Template:De-nm', 'Template:De-nm-dk', 'Template:De-nm-dk-f', 'Template:De-nm-dk-m', 'Template:De-nm-dk-n', 'Template:De-nm-prefix', 'Template:De-nm-suffix', 'Template:De-prnc', 'Template:De-vb', 'Template:De-vb-2', 'Template:De-vb-3', 'Template:De-vb-laden', 'Template:De-vb-prefix', 'Template:De-vb-stehen'}
skip = {i for i in range(n) if {x.name for x in zpgl[i].templates()} & skiptemp}
todo = set(range(n)) - skip
todo_add = {i for i in todo if '发音===' in ztxtl[i].simp()} # the "读音" heading also needs detecting, so this is best done after section headings are normalized site-wide
todo_new = todo - todo_add
otxt_default = '===發音===\n'
itxt = [''] * n
otxt = [otxt_default] * n
dtxtl = [''] * n  # dewikt source text per entry, filled in the todo_new loop below
for i in todo_add.copy():
try:
ztxt_pron()
except:
pass
if itxt[i]:
        otxt[i] = itxt[i][:-1] # after wtel() every section ends with two newlines; drop one
else:
todo_add.remove(i)
todo.remove(i)
# zcat commonswiki-20210920-all-titles.gz | grep '6 De-' | grep 'ogg$' | sed 's/^6\t//' > audio-de.txt
audiode = fromfile('dump/audio-de.txt')
audioq188 = fromfile('dump/audio-q188.txt')
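# fromfile() is defined in prelude; a minimal stand-in under the assumption that it simply
# loads one title per line (hypothetical sketch, not used below). Whatever it does, the
# resulting lists must be sorted, since gen_audiocom() looks titles up with bisect.
def fromfile_sketch(path):
    with open(path, encoding='utf8') as f:
        return sorted(line.rstrip('\n') for line in f)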
audiocom = defaultdict(set)
for i in todo_new.copy():
dtxtl[i] = de.Pages[pn[i]].text()
    ext_de()  # extract the German ({{Sprache|Deutsch}}) section
    # the dewikt page may also have no German section
if dtxtl[i]:
add_ipa()
add_hyph()
add_rhyme()
for i in todo:
gen_audiocom()
if i in todo_add:
cut_audiocom()
add_audiocom()
todo_add = {i for i in todo_add if otxt[i] != otxt_default}
todo_new = {i for i in todo_new if otxt[i] != otxt_default}
todo = todo_add | todo_new
for i in todo_new.copy():
    add_pron() # some entries have the language heading but no L3 section, so check itxt[i] afterwards
if not itxt[i]:
todo_new.remove(i)
# Before the final edits, manually check itxt and otxt for odd formatting
todo = todo_add | todo_new  # recompute: entries dropped from todo_new above must not be edited
for i in todo:
ztxtl[i] = ztxtl[i].replace(itxt[i], otxt[i] + '\n')
ztxts[i] = ztxtl[i].split('\n')
tryedit(zpgl[i], ztxtl[i], defaultsummary)
def wtel():
    # Normalize the zhwikt page text: one space after list markers, no spaces inside heading
    # fences, exactly one blank line before each heading except the first, and a blank line
    # before '----' separators.
    ntxt = ztxtl[i]
    ntxt = re.sub(r'^([#\*][#\*:;]*)([^ #\*:;])', r'\1 \2', ntxt, flags=re.M)
    ntxt = re.sub(r'==([^=]*)==', lambda m: '==%s==' % m[1].strip(), ntxt)
    ntxt = re.sub(r'\n\n+', '\n', ntxt)
    # Re-insert one blank line before every heading, but not before the first one
    ntxt = ntxt.replace('==', '\n\n==', 1)
    ntxt = re.sub(r'([^\n])\n==', r'\1\n\n==', ntxt)
    ntxt = ntxt.replace('\n\n==', '\n==', 1)
    ntxt = re.sub(r'\n+----\n', '\n\n----\n', ntxt)
    ntxt = ntxt.strip('\n')
    # Leave {{Also}} handling for later; only fix an oddly formatted ':{{also' on the first line
    ntxt = re.sub(r'^: *\{\{also', '{{also', ntxt)
    ztxtl[i] = ntxt
    ztxts[i] = ztxtl[i].split('\n')
def ext_de():
    # Keep only the '({{Sprache|Deutsch}})' L2 section of the dewikt text, or '' if there is none
    dast = wtp.parse(dtxtl[i])
    for dsec in dast.get_sections(level = 2):
        dsectitle = dsec.title.strip(' ') # just in case
if not dsectitle.endswith('({{Sprache|Deutsch}})'):
continue
dtxtl[i] = dsec.string
break
else:
dtxtl[i] = ''
def add_ipa():
    # Collect {{Lautschrift|...}} transcriptions from dewikt lines that contain {{IPA}}
    ipas = subp.run(['grep', '{{IPA}}'], stdout=subp.PIPE, input=dtxtl[i], encoding='utf8').stdout
ipal = []
def ext_ipa(x):
l = []
for m in re.finditer(r'\{\{Lautschrift\|([^{}|]*)', x):
if not '=' in m[1]:
l.append('[%s]' % m[1])
if l:
ipal.append(l)
for x in ipas.split('\n'):
ext_ipa(x)
if len(ipal) == 1:
otxt[i] += '* {{IPA|de|%s}}\n' % '|'.join(ipal[0])
elif len(ipal) > 1:
s = set(ipal[0])
for l in ipal[1:]:
if set(l) != s:
break
else:
otxt[i] += '* {{IPA|de|%s}}\n' % '|'.join(ipal[0])
return
print((i, pn[i], "多种IPA"))
for l in ipal:
print('* {{IPA|de|%s}}' % '|'.join(l))
def add_hyph():
    # Take the line after each {{Worttrennung}} heading and turn its first comma-separated
    # item into a {{hyphenation}} line
    hyphs = subp.run(['grep', '--no-group-separator', '-A', '1', '{{Worttrennung}}'], stdout=subp.PIPE, input=dtxtl[i], encoding='utf8').stdout
    hyphl = []
    for x in hyphs.split('\n'):
        if '{{Worttrennung}}' in x or not x:
            continue
        # Some dewikt entries omit the comma, so also split on '{'; the case of two
        # hyphenations on one line is not handled for now
        y = x.split(',')[0].split('{')[0].strip().replace(':', '').replace('·', '|').replace(' ', '|')
        if y:
            hyphl.append('* {{hyphenation|de|%s}}\n' % y)
    if len(hyphl) == 1:
        otxt[i] += hyphl[0]
    elif len(hyphl) > 1:
        # Only add when all occurrences agree; otherwise print for manual review
        s = hyphl[0]
        for x in hyphl[1:]:
            if x != s:
                break
        else:
            otxt[i] += hyphl[0]
            return
        print((i, pn[i], "multiple hyphenation variants"))
        for x in hyphl:
            print(x, end="")
def add_rhyme():
    # Collect {{Reim|...}} rhyme keys from dewikt lines that contain {{Reime}}
    rhymes = subp.run(['grep', '{{Reime}}'], stdout=subp.PIPE, input=dtxtl[i], encoding='utf8').stdout
rhymel = []
def ext_rhyme(x):
l = []
for m in re.finditer(r'\{\{Reim\|([^{}|]*)', x):
            if '=' not in m[1]:
                l.append(m[1])  # the rhyme key goes into {{rhymes}} without IPA-style brackets
if l:
rhymel.append(l)
for x in rhymes.split('\n'):
ext_rhyme(x)
if len(rhymel) == 1:
otxt[i] += '* {{rhymes|de|%s}}\n' % '|'.join(rhymel[0])
elif len(rhymel) > 1:
s = set(rhymel[0])
for l in rhymel[1:]:
if set(l) != s:
break
else:
otxt[i] += '* {{rhymes|de|%s}}\n' % '|'.join(rhymel[0])
return
print((i, pn[i], "多种rhyme"))
for l in rhymel:
print('* {{rhymes|de|%s}}' % '|'.join(l))
"""
:{{IPA}} {{Lautschrift|dɔɪ̯t͡ʃ}}
:{{Hörbeispiele}} {{Audio|De-Deutsch.ogg}}, {{Audio|De-Deutsch2.ogg}}
:{{Reime}} {{Reim|ɔɪ̯t͡ʃ|Deutsch}}
"""
@fct.total_ordering
class node:
def __init__(s, kyu=0, title='', a=0, b=0, z='', dummy = 0):
        if dummy:
            s.kyu = kyu
            s.title = dummy  # sentinel node: title holds an end offset instead of heading text
            return
s.kyu, s.title, s.a, s.b, s.z = kyu, title, a, b, z
s.l, s.f, s.r = None, None, None
def __bool__(s):
return isinstance(s.title, str)
def __eq__(x, y):
return x.kyu == y.kyu
def __lt__(x, y):
return x.kyu < y.kyu
def __str__(s):
return "%s%s%s" % ('=' * s.kyu, s.title, '=' * s.kyu)
def __repr__(s):
return "%s%s%s%s" % ('=' * s.kyu, s.title, '=' * s.kyu, s.z)
    @property
    def c(s):
        # End offset of this node's section: the next real sibling's start, or the offset
        # stored in the trailing dummy sentinel
        n1 = s.r
        return n1.a if n1 else n1.title
@property
def g(s):
if _l := s.l:
return _l.a
else:
return s.c
def printtree(s, i=0):
print(' ' * i + str(s))
if s.l:
s.l.printtree(i + 1)
if s.r:
s.r.printtree(i)
def gendummy(s):
if s.r is None:
s.r = node(kyu=s.kyu, dummy=s.f.c)
if s.r:
s.r.gendummy()
if s.l:
s.l.gendummy()
def selectson(s, k):
f = k if callable(k) else lambda x: simp(x.title) == k
_c = s.l
ret = []
while _c:
if f(_c):
ret.append(_c)
_c = _c.r
if len(ret) == 0:
raise ValueError((11, i, str(s)))
if len(ret) > 1:
raise ValueError((12, i, str(s)))
return ret[0]
def ztxt_pron():
# *? for non-greedy
    # '=' should in principle never appear in a valid heading on zhwikt; elsewhere unclear
rx1 = re.compile(r'^(===*)\s*([^=]*?)\s*(===*)\s*?(\n+)', flags=re.MULTILINE)
def _f1(m):
# need to check sameness in zhwikt
return node(min(len(m[1]), len(m[3])), m[2], m.start(), m.end(), m[4])
ztxt = ztxtl[i]
zhl = list(map(_f1, rx1.finditer(ztxt)))
zrt = node(0, '', 0, 0, '')
_c = zrt
    # known issue: this breaks when an h2 has an h4 followed by an h3 under it
for nd in zhl:
while nd < _c:
_c = _c.f
if nd == _c:
_c.r = nd
nd.f = _c.f
_c = nd
else:
if _c.l:
raise ValueError((14, i))
_c.l = nd
nd.f = _c
_c = nd
zrt.r = node(dummy=len(ztxt))
zrt.gendummy()
zsecnode = zrt.selectson('德语').selectson('发音')
itxt[i] = ztxt[zsecnode.a:zsecnode.c]
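# For an entry laid out as ==德語== … ===發音=== … ===名詞=== …, ztxt_pron() stores in itxt[i]
# the slice from the ===發音=== heading up to the next heading of the same or higher level
# (selectson() compares titles after simp(), so both 德语 and 德語 match).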
def gen_audiocom():
    # Record which Commons audio files exist for this entry. Codes: 0 = LL-Q188 recording
    # by Sebastian Wallroth, 1/2 = De-X.ogg / De-X2.ogg, 5/6 = the Austrian De-at-X variants.
    # The title lists must be sorted for the bisect lookup.
    def bsearch(a, x):
        j = bisect.bisect_left(a, x)
        return j != len(a) and a[j] == x
    if bsearch(audiode, 'De-%s.ogg' % pn_[i]):
        audiocom[i].add(1)
    if bsearch(audiode, 'De-%s2.ogg' % pn_[i]):
        audiocom[i].add(2)
    if bsearch(audiode, 'De-at-%s.ogg' % pn_[i]):
        audiocom[i].add(5)
    if bsearch(audiode, 'De-at-%s2.ogg' % pn_[i]):
        audiocom[i].add(6)
    if bsearch(audioq188, 'LL-Q188_(deu)-Sebastian_Wallroth-%s.wav' % pn_[i]):
        audiocom[i].add(0)
def cut_audiocom():
    # Remove from audiocom[i] the files already linked via {{audio|de|...}} in the zhwikt entry.
    # This is best done after language parameters are made consistent; otherwise things like
    # lang=dde can still appear under a de section
for m in re.finditer(r'\{\{audio\|de\|([^{}|]*)', ztxtl[i]):
bt = m[1].strip().replace('_', ' ')
bt = bt[0].upper() + bt[1:]
p = pn[i]
p1 = '-' + p[0].lower() + p[1:]
p2 = '-' + p[0].upper() + p[1:]
bt = bt.replace(p1, '').replace(p2, '')
        bt = re.sub(r'([a-z])\.', r'\g<1>1.', bt)  # a missing trailing number means variant 1
        bt = bt[:-4] + bt[-4:].lower() # normalize extensions like .OGG to lowercase
if md := re.match(r'De(\d)\.ogg', bt):
audiocom[i].discard(int(md[1]))
elif md := re.match(r'De-at(\d)\.ogg', bt):
audiocom[i].discard(int(md[1]) + 4)
elif md := re.match(r'LL-Q188 \(deu\)-Sebastian Wallroth1.wav', bt):
audiocom[i].discard(0)
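# Example: with pn[i] == 'Deutsch', an existing {{audio|de|De-Deutsch.ogg}} in the entry
# normalizes to 'De1.ogg', so code 1 is discarded and that file is not added again.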
def add_audiocom():
    # Append one {{audio}} line per remaining Commons file code, in a stable order
    for t in sorted(audiocom[i]):
if t == 0:
p2 = 'LL-Q188 (deu)-Sebastian Wallroth-%s.wav|音頻(德國)' % pn[i]
elif 1 <= t <= 4:
if t == 1:
t = ''
p2 = 'De-%s%s.ogg|音頻' % (pn[i], t)
elif 5 <= t <= 8:
t -= 4
if t == 1:
t = ''
p2 = 'De-at-%s%s.ogg|音頻(奧地利)' % (pn[i], t)
otxt[i] += '* {{audio|de|%s}}\n' % p2
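# Example: audiocom[i] == {0, 5} with pn[i] == 'Deutsch' appends
#   * {{audio|de|LL-Q188 (deu)-Sebastian Wallroth-Deutsch.wav|音頻(德國)}}
#   * {{audio|de|De-at-Deutsch.ogg|音頻(奧地利)}}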
def add_pron():
    # For entries without a pronunciation section: insert otxt[i] right before the first
    # L3 part-of-speech heading under ==德語==
    j1 = its.nth(its.locate(ztxts[i], lambda x: re.match(r'==德[語语]==', x)), 0, -1)
if j1 == -1:
return
for j, x in its.islice(enumerate(ztxts[i]), j1, None):
if re.search(r'[词詞]===$', x):
            # relies on normalization guaranteeing a blank line before every L3 heading
ztxts[i][j - 1] = '\n' + otxt[i]
ztxtl[i] = '\n'.join(ztxts[i])
break
else:
        # No L3 section found: possibly multiple etymology sections or nonstandard formatting
        print((i, pn[i], "no L3 section found"))
return
ztxts[i] = ztxtl[i].split('\n')
itxt[i] = otxt_default + '\n'
def verify():
    # Write the generated pronunciation blocks to a scratch file for manual review before editing
    with open('/tmp/tmpv', 'w') as f:
for i in todo_add:
print(('🐮', i, pn[i]), file=f)
print(otxt[i], file=f)