from prelude import *
from botaccount import *
# --- Bot sessions and shared module-level state ---
# mwc, UA, UN, PWD come from the star imports above (presumably mwclient,
# user agent and credentials from prelude/botaccount — cannot tell which here).
zh = mwc.Site('zh.wiktionary.org', clients_useragent = UA)
en = mwc.Site('en.wiktionary.org', clients_useragent = UA)
zh.login(UN, PWD)
en.login(UN, PWD)
# Sleep interval between edits; 0 = no throttling.
SLP = 0
# Work queue: every main-namespace zhwikt page transcluding Template:IPA.
focustemp = zh.Pages['Template:IPA']
zpgl = list(focustemp.embed(namespace = 0))
#zpgl = list(zh.categories['有脚本错误的页面'])
pn = list(map(lambda p: p.name, zpgl))
n = len(pn)
#epgl = list(map(getepg, zpgl))
# Accumulators filled in by process()/process1()/process2() below.
fail, success, purifylog, titlelog, successall = [], [], [], [], []
dbg, dbgout = False, None
fail1, fail2 = [], []
i, j = 0, 0 # NOTE: j is never declared global in the functions below, so it is unused
defaultsummary = 'Bot: 自enwikt搬运发音章节,修%s' % 'IPA与audio1'
# Namespaces eligible for editing (main namespace only).
nslist = [0]
# Scratch sets / list recording values or page indices needing manual review.
s1, s2, s3 = set(), set(), set()
rfapde, customtext, unknown = set(), set(), set()
qualm, accent = set(), set()
audiom = []
@fct.total_ordering
class node:
    """One wiki section heading in a threaded tree.

    Attributes:
        kyu:   heading level (number of '=' signs).
        title: heading text for real nodes; for dummy end-marker nodes this
               holds an int text offset instead (which also makes them falsy).
        a, b:  start / end offsets of the heading match in the page text.
        z:     the newline run that followed the heading.
        l/f/r: first child / parent / next sibling (None when absent).
    """

    def __init__(self, kyu=0, title='', a=0, b=0, z='', dummy=0):
        if dummy:
            # End-marker node: carries a text offset in .title and nothing else.
            self.kyu = kyu
            self.title = dummy
            return
        self.kyu = kyu
        self.title = title
        self.a = a
        self.b = b
        self.z = z
        self.l = None
        self.f = None
        self.r = None

    def __bool__(self):
        # Real headings have a str title; dummy end-markers carry an int.
        return isinstance(self.title, str)

    def __eq__(self, other):
        # Ordering/equality is by heading level only (total_ordering fills
        # in the remaining comparison operators).
        return self.kyu == other.kyu

    def __lt__(self, other):
        return self.kyu < other.kyu

    def __str__(self):
        bars = '=' * self.kyu
        return "%s%s%s" % (bars, self.title, bars)

    def __repr__(self):
        bars = '=' * self.kyu
        return "%s%s%s%s" % (bars, self.title, bars, self.z)

    def c(self):
        """Offset where this node's section text ends: the next sibling's
        heading start, or the stored offset of a dummy end-marker."""
        nxt = self.r
        if nxt:
            return nxt.a
        return nxt.title

    def g(self):
        """Offset where this node's own body ends: the first child's heading
        start, or the section end when there are no children."""
        child = self.l
        if child:
            return child.a
        return self.c()

    def printtree(self, i=0):
        """Debug dump: children indented one level, siblings at the same level."""
        print(' ' * i + str(self))
        for nxt, depth in ((self.l, i + 1), (self.r, i)):
            if nxt:
                nxt.printtree(depth)

    def gendummy(self):
        """Append a dummy end-marker after every last sibling so that c()
        works everywhere; the marker inherits the parent's section end."""
        if self.r is None:
            self.r = node(kyu=self.kyu, dummy=self.f.c())
        if self.r:
            self.r.gendummy()
        if self.l:
            self.l.gendummy()

    def selectson(self, k):
        """Return the single child matched by k (a predicate, or a title to
        compare after simp()); raises ValueError on zero or multiple hits."""
        def _by_title(x):
            return simp(x.title) == k
        pred = k if callable(k) else _by_title
        matches = []
        child = self.l
        while child:
            if pred(child):
                matches.append(child)
            child = child.r
        if not matches:
            raise ValueError((11, i, str(self)))
        if len(matches) > 1:
            raise ValueError((12, i, str(self)))
        return matches[0]
def process1():
    """Build the zhwikt-ready 'German > Pronunciation' section from enwikt.

    Fetches the English-Wiktionary page pn[i], parses its heading tree,
    extracts the German Pronunciation section, strips {{rfap|de}}, and
    normalises template names ({{IPA}}, {{enPR}}, {{rhymes}},
    {{hyphenation}}, {{homophones}}, {{a}}) plus {{audio}} captions.

    Returns the rewritten section wikitext (str).
    Raises ValueError((13, i)) when the en page is empty/missing,
    ValueError((10, i)) on a malformed heading tree, and (via selectson)
    (11/12, i, ...) when the target section is missing or duplicated.
    Side effects: records review flags in the module-level sets rfapde,
    customtext, unknown, qualm, accent and the audiom list.
    """
    # *? for non-greedy matching of the heading title.
    # '=' should never occur inside a legal zhwikt heading; elsewhere unclear.
    rx1 = re.compile(r'^(===*)\s*([^=]*?)\s*(===*)\s*?(\n+)', flags=re.MULTILINE)

    def _f1(m):
        # Level = min of the two '=' runs (need to check sameness in zhwikt).
        return node(min(len(m[1]), len(m[3])), m[2], m.start(), m.end(), m[4])

    epg = en.Pages[pn[i]]
    epn = pn[i]
    etxt = epg.text()
    if etxt == '':
        raise ValueError((13, i))
    ehl = list(map(_f1, rx1.finditer(etxt)))
    # Thread the headings into a tree: .l = first child, .r = next sibling,
    # .f = parent.  Misbehaves when an h2 contains an h4 followed by an h3.
    ert = node(0, '', 0, 0, '')
    _c = ert
    for nd in ehl:
        while nd < _c:
            _c = _c.f
        if nd == _c:
            # Same level: link as next sibling.
            _c.r = nd
            nd.f = _c.f
            _c = nd
        else:
            # Deeper level: descend as first child.
            if _c.l:
                raise ValueError((10, i))
            _c.l = nd
            nd.f = _c
            _c = nd
    # Dummy end-markers let node.c() report where each section ends.
    ert.r = node(dummy=len(etxt))
    ert.gendummy()
    esecnode = ert.selectson('German').selectson('Pronunciation')
    estxt = etxt[esecnode.b:esecnode.c()]
    estxt = estxt.replace('{{PAGENAME}}', epn)

    def _f2(m):
        # Collapse the surrounding newlines when {{rfap|de}} is removed.
        return '' if m[0] == '' else '\n'

    (estxt, _cnt) = re.subn(r'(\n?)' + re.escape('{{rfap|de}}') + r'(\n?)', _f2, estxt)
    if _cnt:
        rfapde.add(i)
    esast = wtp.parse(estxt)
    ests = esast.templates

    def _f3(l, r):
        # Flag free text between templates (anything beyond list markers and
        # CJK/ASCII punctuation) for manual review.
        if l < r and re.search(r'[^ \t\n\*,,.。;;::#]', estxt[l:r]):
            customtext.add(i)

    lastr = 0
    for t in ests:
        l, r = t._span_data[:2]
        _f3(lastr, l)
        lastr = max(lastr, r)
    _f3(lastr, len(estxt))

    # Values of {{q}}/{{qualifier}} that are really accent descriptions and
    # may safely be folded into {{a}}.  Hoisted out of the template loop.
    _known_quals = ['overall more common; particularly northern and eastern regions', 'common form in southern Germany, Austria, and Switzerland', 'standard; used naturally in western Germany and Switzerland', 'colloquial; when unstressed by regular shortening, but also used when stressed', 'standard', 'often in fluent speech, not usually in isolation']

    for t in ests:
        tn = t.name.lower()
        if tn == 'ipa':
            # FIX: write the canonical name back — previously a dead
            # assignment, inconsistent with every sibling branch.
            tn = 'IPA'
            t.name = tn
        elif tn == 'enpr':
            # FIX: same missing write-back as the 'ipa' branch.
            tn = 'enPR'
            t.name = tn
        elif tn in ['rhyme', 'rhymes']:
            tn = 'rhymes'
            t.name = tn
        elif tn in ['hyph', 'hyphenation']:
            tn = 'hyphenation'
            t.name = tn
        elif tn in ['hmp', 'homophone', 'homophones']:
            tn = 'homophones'
            t.name = tn
        elif tn == 'audio':
            args = t.arguments
            # enwikt audio captions -> zhwikt captions.
            dct = {'': '音频', 'austria': '音频(奥地利)', 'bavaria': '音频(巴伐利亚)', 'bavarian': '音频(巴伐利亚)', 'berlin': '音频(柏林)', 'german': '音频(德国)', 'germany': '音频(德国)'}
            if len(args) >= 3 and args[2].positional:
                # epn[1:] skips the first letter, whose case varies freely
                # in file captions.
                if re.search(re.escape(epn[1:]), args[2].value):
                    args[2].value = '音频'
                else:
                    try:
                        args[2].value = dct[args[2].value.lower().wipecl(' ()():', ['audio'])]
                    except Exception:
                        # FIX: was a bare except — never swallow
                        # KeyboardInterrupt/SystemExit during a long bot run.
                        # Unmapped captions are queued for manual review.
                        audiom.append((i, args[2].value))
        elif tn in ['a', 'accent', 'i', 'q', 'qual', 'qualifier']:
            if tn in ['i', 'q', 'qual', 'qualifier']:
                args = t.arguments
                for x in args:
                    if x.value not in _known_quals:
                        qualm.add(x.value)
                    else:
                        accent.add(x.value)
            # Normalise all accent/qualifier templates to {{a}}.
            tn = 'a'
            t.name = tn
        else:
            unknown.add(i)
    return esast.string
def process2():
    """Split the zhwikt page pn[i] around its '德语 > 发音' section.

    Returns [before_text, after_text, err, payload]: err is 0 when the
    existing section is safe to overwrite, 1 when it contains free custom
    text, 2 when it contains an unrecognised template; payload carries the
    old section wikitext whenever err != 0.
    Raises ValueError((11, i)) on a malformed heading tree and (via
    selectson) (11/12, i, ...) when the section is missing or duplicated.
    """
    # *? for non-greedy matching of the heading title.
    # '=' should never occur inside a legal zhwikt heading; elsewhere unclear.
    rx1 = re.compile(r'^(===*)\s*([^=]*?)\s*(===*)\s*?(\n+)', flags=re.MULTILINE)
    def _f1(m):
        # Level = min of the two '=' runs (need to check sameness in zhwikt).
        return node(min(len(m[1]), len(m[3])), m[2], m.start(), m.end(), m[4])
    zpg = zh.Pages[pn[i]]
    zpn = pn[i]
    ztxt = zpg.text()
    zhl = list(map(_f1, rx1.finditer(ztxt)))
    # Thread the headings into a tree: .l = first child, .r = next sibling,
    # .f = parent.  Misbehaves when an h2 contains an h4 followed by an h3.
    zrt = node(0, '', 0, 0, '')
    _c = zrt
    for nd in zhl:
        while nd < _c:
            _c = _c.f
        if nd == _c:
            _c.r = nd
            nd.f = _c.f
            _c = nd
        else:
            if _c.l:
                raise ValueError((11, i))
            _c.l = nd
            nd.f = _c
            _c = nd
    zrt.r = node(dummy=len(ztxt))
    zrt.gendummy()
    zsecnode = zrt.selectson('德语').selectson('发音')
    # [text before the section, text after the section, error code, payload]
    zret = [ztxt[0:zsecnode.b], ztxt[zsecnode.c():len(ztxt)], 0, '']
    zstxt = ztxt[zsecnode.b:zsecnode.c()]
    zstxt = zstxt.replace('{{PAGENAME}}', zpn)
    zsast = wtp.parse(zstxt)
    zsts = zsast.templates
    def _f3(l, r):
        # Flag free text between templates, after stripping punctuation,
        # list markers and a few harmless boilerplate words.
        if l < r and re.search(r'[^ \t\n\*,,.。;;::#]', simp(zstxt[l:r]).wipel(['奥地利', '标准', '德语', '发音', '男', '女'])):
            zret[2] = 1
    lastr = 0
    for t in zsts:
        l, r = t._span_data[:2]
        _f3(lastr, l)
        lastr = max(lastr, r)
    _f3(lastr, len(zstxt))
    if zret[2]:
        # Custom text found: return early, carrying the old section so the
        # caller can write a manual-review diff page.
        zret[3] = zstxt
        return zret
    # No free text — verify that only known templates appear in the section.
    for t in zsts:
        tn = t.name.lower()
        if tn in ['ipa', 'ipa1', 'ipa4']:
            pass
        elif tn == 'enpr':
            pass
        elif tn in ['rhyme', 'rhymes']:
            pass
        elif tn in ['hyph', 'hyphenation']:
            pass
        elif tn in ['hmp', 'homophone', 'homophones']:
            pass
        elif tn in ['audio', 'audio1']:
            pass
        elif tn in ['a', 'accent', 'i', 'q', 'qual', 'qualifier']:
            pass
        else:
            # Unknown template: section needs manual review.
            zret[2] = 2
    if zret[2]:
        zret[3] = zstxt
    return zret
def process():
    """Run the full pipeline for page pn[i].

    Builds the new pronunciation section from enwikt (process1), splits the
    zhwikt page around the old one (process2), then either edits the article
    directly or dumps diagnostics to User:Qnm/... subpages.  Failures are
    appended to the module-level fail list.
    """
    try:
        newsec = process1()
    except ValueError as exc:
        fail.append(exc)
        return
    try:
        before, after, err, payload = process2()
    except ValueError as exc:
        # Shift process2's error code by 10 so it stays distinguishable from
        # process1's codes, and park the generated section for later reuse.
        code, *rest = exc.args[0]
        fail.append((code + 10, *rest))
        tryedit(zh.Pages['User:Qnm/epg-on/%s' % pn[i]], newsec)
        return
    if err:
        # Old section needs review: write a diff page plus a full preview
        # instead of touching the article itself.
        tryedit(zh.Pages['User:Qnm/diff/%s' % pn[i]], newsec + "\n\n🐝👨🏻🐮🍺\n\n" + payload)
        tryedit(zh.Pages['User:Qnm/zpg/%s' % pn[i]], before + newsec + after)
        return
    tryedit(zh.Pages[pn[i]], before + newsec + after, summary=defaultsummary, fail=fail)