技术细节 编辑

  • IPA、断字来自dewikt
  • 音频来自commons的文件列表。大规模运行应二分查找。
  • dewikt要检查,zhwikt原有章节是否有奇葩格式也要检查。相比之下还是2010年Wjcdbot写的条目简单,就一行表示意思
  • 参见en:User:DerbethBot perlwiki

代码 编辑

from prelude import  *
from botaccount import *
from data.accent import *

"deaudio.py"

# Driver for the "add German audio / pronunciation" bot task.
#
# NOTE: this part calls wtel(), ztxt_pron(), ext_de(), add_ipa(), add_hyph(),
# add_rhyme(), gen_audiocom(), cut_audiocom(), add_audiocom() and add_pron(),
# which appear further down in this listing; they have to be defined before
# this part is executed.  All of them take no arguments and work on the page
# selected by the module-level loop variable `i` and on the per-page lists
# created here (ztxtl, ztxts, itxt, otxt, dtxtl, audiocom).

zh = mwc.Site('zh.wiktionary.org', clients_useragent = UA)
en = mwc.Site('en.wiktionary.org', clients_useragent = UA)
de = mwc.Site('de.wiktionary.org', clients_useragent = UA)
zh.login(UN, PWD)
en.login(UN, PWD)
de.login(UN, PWD)

# Candidate pages: main-namespace zhwikt pages that embed Template:IPA.
zpgl = list(zh.Pages['Template:IPA'].embed(namespace = 0))
pn = list(map(lambda p: p.name, zpgl))
pn_ = [x.replace(' ', '_') for x in pn]  # titles in the underscore form used by the file lists
n = len(pn)
defaultsummary = '[[User:CrowleyBot/task/21|根据commons文件列表增加德语音频]]'

ztxtl = [x.text() for x in zpgl]          # full wikitext per page
ztxts = [x.split('\n') for x in ztxtl]    # the same text split into lines
for i in range(n):
    wtel()

# Pages embedding any of these templates are left alone.
skiptemp = {
    'Template:Adjektiv-Tabelle', 'Template:De-clothes-words',
    'Template:Deklinationsseite Adjektiv', 'Template:Deutsch Substantiv m stark',
    'Template:Substantiv-Tabelle', 'Template:Verb-Tabelle', 'Template:Da-nm',
    'Template:De', 'Template:De-aj', 'Template:De-aj-dk', 'Template:De-av',
    'Template:De-cat', 'Template:De-decl-noun', 'Template:De-nm',
    'Template:De-nm-dk', 'Template:De-nm-dk-f', 'Template:De-nm-dk-m',
    'Template:De-nm-dk-n', 'Template:De-nm-prefix', 'Template:De-nm-suffix',
    'Template:De-prnc', 'Template:De-vb', 'Template:De-vb-2', 'Template:De-vb-3',
    'Template:De-vb-laden', 'Template:De-vb-prefix', 'Template:De-vb-stehen',
}
skip = {i for i in range(n) if {x.name for x in zpgl[i].templates()} & skiptemp}
todo = set(range(n)) - skip
# todo_add: pages that already have a pronunciation heading (extend it);
# todo_new: pages without one (a new section has to be created).
# Sections titled "读音" are not detected here, so this is best run after
# section titles have been normalised site-wide.
# NOTE(review): `.simp()` is called as a str method here while selectson()
# uses a free function `simp(...)` — confirm which form prelude provides.
todo_add = {i for i in todo if '发音===' in ztxtl[i].simp()}
todo_new = todo - todo_add

otxt_default = '===發音===\n'
itxt = [''] * n                 # text to be replaced on the zhwikt page
otxt = [otxt_default] * n       # replacement text being assembled
dtxtl = [''] * n                # dewikt wikitext (later: its German section) per page

for i in todo_add.copy():
    try:
        ztxt_pron()
    except Exception as e:
        # Best effort: a page whose section tree cannot be resolved is
        # reported and then dropped below because itxt[i] stays empty.
        print((i, pn[i], repr(e)))
    if itxt[i]:
        # After wtel() every section ends with two newlines; drop one.
        otxt[i] = itxt[i][:-1]
    else:
        todo_add.remove(i)
        todo.remove(i)

# zcat commonswiki-20210920-all-titles.gz | grep '6        De-' | grep 'ogg$' | sed 's/^6\t//' > audio-de.txt
# NOTE(review): the command that produced audio-q188.txt is not recorded
# here (the original listing repeated the line above) — presumably an
# analogous filter for the LL-Q188 files.
audiode = fromfile('dump/audio-de.txt')
audioq188 = fromfile('dump/audio-q188.txt')
audiocom = defaultdict(set)

for i in todo_new.copy():
    dtxtl[i] = de.Pages[pn[i]].text()
    ext_de() # extract the German section
    # dewikt may have no German section either
    if dtxtl[i]:
        add_ipa()
        add_hyph()
        add_rhyme()

for i in todo:
    gen_audiocom()
    if i in todo_add:
        cut_audiocom()
    add_audiocom()

# Keep only pages for which something was actually produced.
todo_add = {i for i in todo_add if otxt[i] != otxt_default}
todo_new = {i for i in todo_new if otxt[i] != otxt_default}
todo = todo_add | todo_new

for i in todo_new.copy():
    # Some entries have the language heading but no L3 section, so the
    # result of add_pron() has to be checked.
    add_pron()
    if not itxt[i]:
        todo_new.remove(i)
        # Must leave `todo` as well: the final loop would otherwise run
        # str.replace('', ...) on the page, which inserts the new text
        # between every character, and then save that.
        todo.discard(i)

# Before the final edit, itxt and otxt should each be checked for odd formatting.
for i in todo:
    ztxtl[i] = ztxtl[i].replace(itxt[i], otxt[i] + '\n')
    ztxts[i] = ztxtl[i].split('\n')
    tryedit(zpgl[i], ztxtl[i], defaultsummary)

def wtel():
    """Normalise the layout of page ``i``'s wikitext in place.

    Reads ``ztxtl[i]``, applies the fixed sequence of rewrites below and
    stores the result back into ``ztxtl[i]`` (whole text) and ``ztxts[i]``
    (the text split on newlines).
    """
    def _trim_heading(m):
        return '==%s==' % m[1].strip()

    # The steps are order-dependent; each one receives the previous result.
    steps = (
        # One space between a list marker run and the item text.
        lambda t: re.sub(r'^([#\*][#\*:;]*)([^ #\*:;])', r'\1 \2', t, flags=re.M),
        # Strip whitespace just inside heading markers.
        lambda t: re.sub(r'==([^=]*)==', _trim_heading, t),
        # Collapse every run of blank lines.
        lambda t: re.sub(r'\n\n+', '\n', t),
        # Pad the first "==" with two newlines ...
        lambda t: t.replace('==', '\n\n==', 1),
        # ... put a blank line in front of every heading that follows text ...
        lambda t: re.sub(r'([^\n])\n==', r'\1\n\n==', t),
        # ... and take one newline back in front of the first "\n\n==".
        lambda t: t.replace('\n\n==', '\n==', 1),
        # Exactly one blank line before a "----" separator line.
        lambda t: re.sub(r'\n+----\n', '\n\n----\n', t),
        lambda t: t.strip('\n'),
        # Capitalised "Also" is not handled yet; only the odd ": {{also"
        # form at the very start of the text is fixed.
        lambda t: re.sub(r'^: *\{\{also', '{{also', t),
    )
    text = ztxtl[i]
    for step in steps:
        text = step(text)
    ztxtl[i] = text
    ztxts[i] = text.split('\n')

def ext_de():
    """Reduce ``dtxtl[i]`` to the German section of the dewikt page.

    The page text is parsed with ``wtp`` and its level-2 sections are
    scanned in order; the first one whose title (spaces stripped) ends
    with ``({{Sprache|Deutsch}})`` replaces ``dtxtl[i]``.  If no section
    qualifies, ``dtxtl[i]`` becomes the empty string.
    """
    parsed = wtp.parse(dtxtl[i])
    dtxtl[i] = next(
        (
            section.string
            for section in parsed.get_sections(level = 2)
            # strip(' ') only removes plain spaces around the title
            if section.title.strip(' ').endswith('({{Sprache|Deutsch}})')
        ),
        '',
    )
def add_ipa():
    """Append an ``{{IPA|de|...}}`` line for page ``i`` to ``otxt[i]``.

    Every line of ``dtxtl[i]`` that contains ``{{IPA}}`` is scanned for
    ``{{Lautschrift|...}}`` templates; the first positional argument of
    each is wrapped in square brackets.  Arguments containing ``=`` and
    blank arguments are ignored.

    If all matching lines give the same set of transcriptions, one IPA line
    built from the first of them is appended.  If they disagree, nothing is
    appended and the candidates are printed for manual review.
    """
    # Filter lines in Python instead of spawning grep for every page.
    ipal = []
    for line in dtxtl[i].split('\n'):
        if '{{IPA}}' not in line:
            continue
        found = [
            '[%s]' % m[1]
            for m in re.finditer(r'\{\{Lautschrift\|([^{}|]*)', line)
            # skip named parameters and empty placeholders ("{{Lautschrift|}}")
            if '=' not in m[1] and m[1].strip()
        ]
        if found:
            ipal.append(found)

    if not ipal:
        return

    first = set(ipal[0])
    if all(set(l) == first for l in ipal[1:]):
        otxt[i] += '* {{IPA|de|%s}}\n' % '|'.join(ipal[0])
        return

    # Conflicting transcriptions: report them, add nothing.
    print((i, pn[i], "多种IPA"))
    for l in ipal:
        print('* {{IPA|de|%s}}' % '|'.join(l))

def add_hyph():
    """Append a ``{{hyphenation|de|...}}`` line for page ``i`` to ``otxt[i]``.

    For every line of ``dtxtl[i]`` containing ``{{Worttrennung}}`` the line
    directly after it is taken as the hyphenation line.  Only the part
    before the first ``,`` (and before the first ``{``, because some dewikt
    entries forget the comma) is used; ``:`` is removed and both ``·`` and
    spaces become ``|``.  Several hyphenations on one line are not handled.

    If all candidates are identical, one is appended; if they differ,
    nothing is appended and the candidates are printed for manual review.
    """
    marker = '{{Worttrennung}}'
    lines = dtxtl[i].split('\n')
    hyphl = []
    # Pair each line with its successor; the previous implementation piped
    # the text through `grep -A 1` and then indexed a second split of an
    # already-split line, which always raised IndexError.
    for header, follower in zip(lines, lines[1:]):
        if marker not in header or marker in follower:
            continue
        y = (
            follower.split(',')[0].split('{')[0].strip()
            .replace(':', '').replace('·', '|').replace(' ', '|')
        )
        if y:
            hyphl.append('* {{hyphenation|de|%s}}\n' % y)

    if not hyphl:
        return

    if all(x == hyphl[0] for x in hyphl[1:]):
        otxt[i] += hyphl[0]
        return

    # Conflicting hyphenations: report them, add nothing.
    print((i, pn[i], "多种hyph"))
    for x in hyphl:
        print(x, end="")


def add_rhyme():
    """Append a ``{{rhymes|de|...}}`` line for page ``i`` to ``otxt[i]``.

    Every line of ``dtxtl[i]`` that contains ``{{Reime}}`` is scanned for
    ``{{Reim|...}}`` templates; the first positional argument of each is
    collected.  Arguments containing ``=`` and blank arguments are ignored.

    If all matching lines give the same set of rhymes, one line built from
    the first of them is appended.  If they disagree, nothing is appended
    and the candidates are printed for manual review.
    """
    # Filter lines in Python instead of spawning grep for every page.
    rhymel = []
    for line in dtxtl[i].split('\n'):
        if '{{Reime}}' not in line:
            continue
        found = [
            # NOTE(review): the value is wrapped in [...] exactly as in
            # add_ipa(); confirm that the target rhymes template really
            # expects brackets around the rhyme.
            '[%s]' % m[1]
            for m in re.finditer(r'\{\{Reim\|([^{}|]*)', line)
            # skip named parameters and empty placeholders ("{{Reim|}}")
            if '=' not in m[1] and m[1].strip()
        ]
        if found:
            rhymel.append(found)

    if not rhymel:
        return

    first = set(rhymel[0])
    if all(set(l) == first for l in rhymel[1:]):
        otxt[i] += '* {{rhymes|de|%s}}\n' % '|'.join(rhymel[0])
        return

    # Conflicting rhymes: report them, add nothing.
    print((i, pn[i], "多种rhyme"))
    for l in rhymel:
        print('* {{rhymes|de|%s}}' % '|'.join(l))

"""
:{{IPA}} {{Lautschrift|dɔɪ̯t͡ʃ}}
:{{Hörbeispiele}} {{Audio|De-Deutsch.ogg}}, {{Audio|De-Deutsch2.ogg}}
:{{Reime}} {{Reim|ɔɪ̯t͡ʃ|Deutsch}}
"""

@fct.total_ordering
class node:
    """One heading in the section tree of a page.

    A regular node stores the heading level (``kyu``), its ``title``, the
    offsets ``a``/``b`` and the string ``z`` handed to the constructor, plus
    three links: ``l`` (first child), ``r`` (next sibling) and ``f``
    (parent).  A *dummy* node is created by passing a truthy ``dummy``; it
    only carries ``kyu`` and stores ``dummy`` as its ``title``, has none of
    the other attributes, and is falsy.  Nodes compare by ``kyu`` only.
    """

    def __init__(self, kyu=0, title='', a=0, b=0, z='', dummy = 0):
        self.kyu = kyu
        if dummy:
            # End marker: the "title" holds the value itself, nothing else is set.
            self.title = dummy
            return
        self.title = title
        self.a, self.b, self.z = a, b, z
        self.l = self.f = self.r = None

    def __bool__(self):
        # Only real headings have a string title; dummies hold a non-string.
        return isinstance(self.title, str)

    def __eq__(self, other):
        return self.kyu == other.kyu

    def __lt__(self, other):
        return self.kyu < other.kyu

    def __str__(self):
        bar = '=' * self.kyu
        return f"{bar}{self.title}{bar}"

    def __repr__(self):
        bar = '=' * self.kyu
        return f"{bar}{self.title}{bar}{self.z}"

    @property
    def c(self):
        """``a`` of the next sibling, or the value stored in a dummy sibling."""
        sibling = self.r
        if sibling:
            return sibling.a
        return sibling.title

    @property
    def g(self):
        """``a`` of the first child if there is one, otherwise ``c``."""
        child = self.l
        return child.a if child else self.c

    def printtree(self, i=0):
        """Print this node, its children (indented) and its following siblings."""
        print('  ' * i + str(self))
        if self.l:
            self.l.printtree(i + 1)
        if self.r:
            self.r.printtree(i)

    def gendummy(self):
        """Give every sibling chain below this node a terminating dummy.

        A node without a sibling gets a dummy holding its parent's ``c``;
        siblings are processed before children.
        """
        if self.r is None:
            self.r = node(kyu=self.kyu, dummy=self.f.c)
        if self.r:
            self.r.gendummy()
        if self.l:
            self.l.gendummy()

    def selectson(self, k):
        """Return the single child selected by ``k``.

        ``k`` is either a predicate taking a node, or a value compared with
        ``simp(child.title)``.  Raises ``ValueError`` with code 11 when no
        child matches and with code 12 when more than one does.
        """
        if callable(k):
            accept = k
        else:
            accept = lambda nd: simp(nd.title) == k
        hits = []
        cur = self.l
        while cur:  # stops at the falsy dummy that ends the chain
            if accept(cur):
                hits.append(cur)
            cur = cur.r
        if not hits:
            raise ValueError((11, i, str(self)))
        if len(hits) > 1:
            raise ValueError((12, i, str(self)))
        return hits[0]

def ztxt_pron():
    """Store the existing German pronunciation section of page ``i`` in ``itxt[i]``.

    All headings of ``ztxtl[i]`` are linked into a tree of ``node`` objects,
    the child '德语' of the root and then its child '发音' are selected, and
    the text from that heading's start up to its ``c`` offset is copied.

    Raises ``ValueError`` with code 14 when a heading would have to become
    the child of a node that already has children (e.g. an h2 followed by
    an h4 and then an h3); ``selectson`` raises for missing or duplicate
    sections.
    """
    # "*?" is non-greedy.  "=" should not occur in valid zhwikt titles;
    # other places are unclear.
    heading_rx = re.compile(r'^(===*)\s*([^=]*?)\s*(===*)\s*?(\n+)', flags=re.MULTILINE)
    page = ztxtl[i]
    # Level is the shorter of the two marker runs (their equality on zhwikt
    # still needs checking); z keeps the newlines that follow the heading.
    headings = [
        node(min(len(m[1]), len(m[3])), m[2], m.start(), m.end(), m[4])
        for m in heading_rx.finditer(page)
    ]

    root = node(0, '', 0, 0, '')
    cur = root
    for hd in headings:
        # Climb until the current node is not deeper than the new heading.
        while hd < cur:
            cur = cur.f
        if hd == cur:
            # Same level: append as the next sibling.
            cur.r = hd
            hd.f = cur.f
        else:
            # Deeper level: must become the first child.
            if cur.l:
                raise ValueError((14, i))
            cur.l = hd
            hd.f = cur
        cur = hd

    # The root's dummy sibling marks the end of the text.
    root.r = node(dummy=len(page))
    root.gendummy()
    section = root.selectson('德语').selectson('发音')
    itxt[i] = page[section.a:section.c]

def gen_audiocom():
    """Record in ``audiocom[i]`` which audio files exist for page ``i``.

    Codes added to the set:
      1 / 2 -- ``De-<title>.ogg`` / ``De-<title>2.ogg`` found in ``audiode``
      5 / 6 -- ``De-at-<title>.ogg`` / ``De-at-<title>2.ogg`` found in ``audiode``
      0     -- ``LL-Q188_(deu)-Sebastian_Wallroth-<title>.wav`` found in ``audioq188``
    The numbered "2" variants are only looked up when the unnumbered file
    exists.  ``<title>`` is ``pn_[i]`` (underscore form).

    Lookups use binary search, so both lists are assumed to be sorted —
    TODO confirm for the output of ``fromfile``.
    """
    def has(titles, wanted):
        # Plain boolean membership test; avoids confusing index 0 with False.
        pos = bisect.bisect_left(titles, wanted)
        return pos != len(titles) and titles[pos] == wanted

    name = pn_[i]
    if has(audiode, 'De-%s.ogg' % name):
        audiocom[i].add(1)
        if has(audiode, 'De-%s2.ogg' % name):
            audiocom[i].add(2)
    if has(audiode, 'De-at-%s.ogg' % name):
        audiocom[i].add(5)
        if has(audiode, 'De-at-%s2.ogg' % name):
            audiocom[i].add(6)
    # The list is loaded as `audioq188`; the name `audio188` used here
    # before does not exist.
    if has(audioq188, 'LL-Q188_(deu)-Sebastian_Wallroth-%s.wav' % name):
        audiocom[i].add(0)

def cut_audiocom():
    """Drop from ``audiocom[i]`` the audio files page ``i`` already uses.

    Every ``{{audio|de|<file>`` in ``ztxtl[i]`` is reduced to a canonical
    key: underscores become spaces, the first letter is upper-cased, the
    ``-<title>`` part is removed, a ``1`` is inserted before the dot when
    the name carries no number, and the extension is lower-cased.  Keys
    ``De<k>.ogg`` discard code ``k``, ``De-at<k>.ogg`` discard ``k + 4`` and
    the Sebastian Wallroth recording discards code 0.

    Best run after language parameters have been made consistent, since
    only templates with the exact ``de`` parameter are seen.
    """
    title = pn[i]
    own_lower = '-' + title[0].lower() + title[1:]
    own_upper = '-' + title[0].upper() + title[1:]
    for m in re.finditer(r'\{\{audio\|de\|([^{}|]*)', ztxtl[i]):
        bt = m[1].strip().replace('_', ' ')
        if not bt:
            # "{{audio|de|}}" names no file; indexing bt[0] would raise.
            continue
        bt = bt[0].upper() + bt[1:]
        bt = bt.replace(own_lower, '').replace(own_upper, '')
        # "De.ogg" -> "De1.ogg": make the implicit first recording explicit.
        bt = re.sub(r'([a-z])\.', r'\g<1>1.', bt)
        bt = bt[:-4] + bt[-4:].lower()  # handles ".OGG"
        if md := re.match(r'De(\d)\.ogg', bt):
            audiocom[i].discard(int(md[1]))
        elif md := re.match(r'De-at(\d)\.ogg', bt):
            audiocom[i].discard(int(md[1]) + 4)
        elif re.match(r'LL-Q188 \(deu\)-Sebastian Wallroth1\.wav', bt):
            audiocom[i].discard(0)

def add_audiocom():
    """Append one ``{{audio|de|...}}`` line to ``otxt[i]`` per code in ``audiocom[i]``.

    Code 0 is the Sebastian Wallroth recording, codes 1-4 are
    ``De-<title><k>.ogg`` and codes 5-8 are ``De-at-<title><k>.ogg``; the
    number ``k`` is left out of the file name when it is 1.  ``<title>`` is
    ``pn[i]``.
    """
    for code in audiocom[i]:
        if code == 0:
            entry = 'LL-Q188 (deu)-Sebastian Wallroth-%s.wav|音頻(德國)' % pn[i]
        elif 1 <= code <= 4:
            suffix = '' if code == 1 else code
            entry = 'De-%s%s.ogg|音頻' % (pn[i], suffix)
        elif 5 <= code <= 8:
            variant = code - 4
            suffix = '' if variant == 1 else variant
            entry = 'De-at-%s%s.ogg|音頻(奧地利)' % (pn[i], suffix)
        otxt[i] += '* {{audio|de|%s}}\n' % entry

def add_pron():
    """Insert the new pronunciation section ``otxt[i]`` into page ``i``.

    Looks for the first line matching ``==德語==`` / ``==德语==`` and, from
    there on, for the first line ending in ``词===`` / ``詞===``.  The line
    just before that heading is overwritten with a newline plus
    ``otxt[i]``; this relies on the text having been normalised so that
    every L3 heading is preceded by an empty line.  ``ztxtl[i]`` and
    ``ztxts[i]`` are rebuilt and ``itxt[i]`` is set to the default heading
    plus a newline as the success marker.

    When the language heading or the L3 heading is missing nothing is
    changed and ``itxt[i]`` stays as it was; the latter case is printed.
    """
    lines = ztxts[i]
    start = its.nth(its.locate(lines, lambda ln: re.match(r'==德[語语]==', ln)), 0, -1)
    if start == -1:
        return

    target = next(
        (
            idx
            for idx, ln in its.islice(enumerate(lines), start, None)
            if re.search(r'[词詞]===$', ln)
        ),
        None,
    )
    if target is None:
        # No L3 section found: several etymologies, or non-standard layout.
        print((i, pn[i], "未发现L3章节"))
        return

    lines[target - 1] = '\n' + otxt[i]
    ztxtl[i] = '\n'.join(lines)
    ztxts[i] = ztxtl[i].split('\n')
    itxt[i] = otxt_default + '\n'

def verify():
    """Dump the assembled text of every page in ``todo_add`` to ``/tmp/tmpv``.

    For each index a marker tuple ``('🐮', i, title)`` is written, followed
    by ``otxt[i]``, so the result can be inspected before editing.
    """
    # Explicit UTF-8: the marker and the page text are non-ASCII, and the
    # locale's default encoding may not be able to represent them.
    with open('/tmp/tmpv', 'w', encoding='utf-8') as f:
        for i in todo_add:
            print(('🐮', i, pn[i]), file=f)
            print(otxt[i], file=f)