User:CrowleyBot/task/19
数据
- 俄语第一批,涉及ru-nm, ru-vb, ru-aj, ru-av
技术细节
- 需要处理head模板为verb/noun form的部分
- 需要翻译notes之类
- 存在标题行为“专有名词”但章节为“名词”的情况,需要手工修复
代码
nm、pm、aj、av、vb的获取方式与User:CrowleyBot/task/18类似。
from prelude import *
from botaccount import *
# Open one client per wiki (Chinese and English Wiktionary), then log in to
# each with the bot credentials, in the same order they were created.
zh = mwc.Site('zh.wiktionary.org', clients_useragent=UA)
en = mwc.Site('en.wiktionary.org', clients_useragent=UA)
for _site in (zh, en):
    _site.login(UN, PWD)
@fct.total_ordering
class node:
    """A wikitext heading stored in a first-child / next-sibling tree.

    Attributes of a real node:
      kyu   -- heading level (number of ``=`` signs); 0 is used for the root
      title -- heading text (always a ``str``)
      a, b  -- start and end offsets of the heading line in the page text
      z     -- the newline run that followed the heading
      l     -- first child, ``f`` -- parent, ``r`` -- next sibling

    A *dummy* node is an end-of-section marker: its ``title`` holds an
    integer text offset instead of a string, which makes it falsy (see
    ``__bool__``).  Nodes compare by ``kyu`` only.
    """

    def __init__(s, kyu=0, title='', a=0, b=0, z='', dummy = 0):
        # Every node, dummy or not, gets the full attribute set so that
        # repr()/printtree()/attribute probes never hit AttributeError.
        s.l, s.f, s.r = None, None, None
        s.kyu, s.a, s.b, s.z = kyu, a, b, z
        if dummy:
            # Dummy marker: ``title`` carries the offset rather than a string.
            s.title = dummy
            return
        s.title = title

    def __bool__(s):
        # Real headings have a string title; dummy markers carry an int.
        return isinstance(s.title, str)

    def __eq__(x, y):
        return x.kyu == y.kyu

    def __lt__(x, y):
        return x.kyu < y.kyu

    def __str__(s):
        return "%s%s%s" % ('=' * s.kyu, s.title, '=' * s.kyu)

    def __repr__(s):
        return "%s%s%s%s" % ('=' * s.kyu, s.title, '=' * s.kyu, s.z)

    @property
    def c(s):
        """Offset where this section ends.

        That is the start of the next sibling when it is a real heading, or
        the offset stored in the dummy marker otherwise.  ``s.r`` must be set
        (``gendummy`` does that); with ``s.r is None`` this raises
        ``AttributeError``.
        """
        n1 = s.r
        return n1.a if n1 else n1.title

    @property
    def g(s):
        """Offset where this section's own text ends.

        The start of the first child heading if there is one, else ``c``.
        """
        if _l := s.l:
            return _l.a
        else:
            return s.c

    def printtree(s, i=0):
        """Print the subtree, one heading per line, indented by depth ``i``."""
        print(' ' * i + str(s))
        if s.l:
            s.l.printtree(i + 1)
        if s.r:
            s.r.printtree(i)

    def gendummy(s):
        """Give every node lacking a next sibling a dummy end marker.

        The marker stores the parent's end offset (``s.f.c``), so the node the
        walk starts from must already have ``r`` set because it may have no
        parent.  Dummies are falsy and are therefore not recursed into.
        """
        if s.r is None:
            s.r = node(kyu=s.kyu, dummy=s.f.c)
        if s.r:
            s.r.gendummy()
        if s.l:
            s.l.gendummy()

    def selectson(s, k):
        """Return the single direct child selected by ``k``.

        ``k`` is either a predicate taking a node, or a title that is compared
        after passing both sides through ``simp``.

        Raises ``ValueError((11, i, str(s)))`` when no child matches and
        ``ValueError((12, i, str(s)))`` when several do.  ``i`` is the
        module-level page index if one is defined, otherwise ``None``.
        """
        f = k if callable(k) else lambda x: simp(x.title) == simp(k)
        _c = s.l
        ret = []
        # Walk the sibling chain; the falsy dummy marker terminates it.
        while _c:
            if f(_c):
                ret.append(_c)
            _c = _c.r
        # ``i`` is not a parameter: it is the driver loop's global.  Look it
        # up defensively so the error path raises the intended ValueError
        # rather than a NameError when no such global exists.
        page = globals().get('i')
        if not ret:
            raise ValueError((11, page, str(s)))
        if len(ret) > 1:
            raise ValueError((12, page, str(s)))
        return ret[0]
def process2():
    """Fill the ``ru-*`` placeholder templates of page ``i`` from ``todo[i]``.

    Takes no arguments: it reads the module-level names ``i``, ``pn``,
    ``todo``, ``nm``, ``pm``, ``aj``, ``av`` and rewrites ``ztxtl[i]`` in
    place.  ``todo[i]`` is presumably the English Wiktionary wikitext for the
    entry -- confirm against the code that fills it.

    Steps:
      1. Parse the headings of ``todo[i]`` into a ``node`` tree.
      2. ``after1``: the text from the start of the first heading up to its
         first child heading (or its end), cut at the first line that starts
         with ``#``.
      3. ``after2``: the child section of that first heading whose title
         matches 變格 (變位 for verbs), if exactly one exists; otherwise ''.
      4. Replace ``{{ru-<ps>|...}}`` with ``after1`` and ``{{ru-<ps>-...}}``
         with ``after2`` in ``ztxtl[i]``.
      5. Print ``(i, pn[i])`` when the result still contains ``|note`` or
         ``meanings``.

    Returns ``None``; returns early without changes when ``todo[i]`` is empty.
    Raises ``ValueError((10, i))`` when the heading levels cannot be nested
    by the simple builder below.
    """
    # *? for non-greedy
    # '=' should in principle never occur in a valid zhwikt title; other
    # places are unclear.
    rx1 = re.compile(r'^(===*)\s*([^=]*?)\s*(===*)\s*?(\n+)', flags=re.MULTILINE)

    def _f1(m):
        # need to check sameness in zhwikt
        return node(min(len(m[1]), len(m[3])), m[2], m.start(), m.end(), m[4])

    etxt = todo[i]
    if etxt == '':
        return

    # Build the heading tree under a level-0 root.
    ehl = list(map(_f1, rx1.finditer(etxt)))
    ert = node(0, '', 0, 0, '')
    _c = ert
    # Known limitation: fails when an h2 contains an h4 followed by an h3.
    for nd in ehl:
        # Climb until the cursor is not deeper than the new heading.
        while nd < _c:
            _c = _c.f
        if nd == _c:
            # Same level: append as the next sibling.
            _c.r = nd
            nd.f = _c.f
            _c = nd
        else:
            # Deeper level: must become the first child of the cursor.
            if _c.l:
                raise ValueError((10, i))
            _c.l = nd
            nd.f = _c
            _c = nd
    # The root has no parent, so it needs its end marker set by hand before
    # gendummy() can derive the markers of everything below it.
    ert.r = node(dummy=len(etxt))
    ert.gendummy()

    # Template suffix and inflection-section title for the current batch.
    ps = ''
    sn = simp('變格')
    if todo is nm:
        ps = 'nm'
    elif todo is pm:
        ps = 'nm'
    elif todo is aj:
        ps = 'aj'
    elif todo is av:
        ps = 'av'
    else:
        ps = 'vb'
        sn = simp('變位')

    # Span of the first heading's own text (up to its first child, if any).
    _l = ert.l.a
    if ert.l.l:
        _r = ert.l.l.a
    else:
        _r = ert.l.c
    # Keep only the lines before the first line that starts with '#'.
    after1 = '\n'.join(its.takewhile(lambda x: not x.startswith('#'), etxt[_l:_r].split('\n')))
    after1 = after1.replace('|adjective form', '|形容詞變格形').replace('|verb form', '|動詞變位形式')

    after2 = ''
    try:
        h4 = ert.l.selectson(sn)
    except ValueError:
        # selectson() signals "no such section" / "ambiguous" this way; the
        # inflection section is optional, so fall back to an empty string.
        pass
    else:
        after2 = etxt[h4.a:h4.c]
        after2 = after2.replace('\n----', '').strip(' \n')

    # Use callable replacements: as plain replacement strings, any backslash
    # in the copied wikitext would be parsed by re.sub as an escape or group
    # reference, corrupting the text or raising re.error.
    ztxtl[i] = re.sub(r'\{\{ru-%s\|[^{}]*\}\}' % ps, lambda _m: after1, ztxtl[i])
    ztxtl[i] = re.sub(r'\{\{ru-%s-[^{}]*\}\}' % ps, lambda _m: after2, ztxtl[i])
    # Report pages that still contain text matching these markers.
    if '|note' in ztxtl[i] or 'meanings' in ztxtl[i]:
        print((i, pn[i]))
# process2() takes no arguments and reads the module-level names `todo` and
# `i`, so both loops below must bind exactly those two globals.
for todo in (nm, pm, aj, av, vb):
    for i in range(n):
        process2()

# Hand-written translations of leftover English notes: (page index, old, new).
_manual_fixes = (
    (204, "''in the meanings \"healthy, wholesome\"'':", "義項“健康的”:"),
    (204, "''in the meanings \"strong, big\"'':", "義項“強壯的,大的”:"),
    (266, "* Dated.", "* 舊"),
    (167, "* Poetic.", "* 詩歌"),
    (303, '* The alternative stress patterns in the singular "мо́ста", "мо́сту", "мо́стом", "мо́сте" are less common.', '* 在單數"мо́ста", "мо́сту", "мо́стом", "мо́сте"中,另一種重讀形式不常見。'),
    (500, "* Uneducated.", "* 未受敎育的"),
)
for _idx, _old, _new in _manual_fixes:
    ztxtl[_idx] = ztxtl[_idx].replace(_old, _new)

# Whitespace normalisation, applied to every page in this order:
#   1. put a space after a leading '#'
#   2. put a space after a leading '#:' / '#::' ... prefix
#   3. make sure a blank line precedes a '====變' heading
# re.sub leaves the text unchanged when nothing matches, so no prior search
# is needed.
# NOTE(review): pattern 1 also matches lines starting with '##' or '#*' and
# would rewrite them to '# #...' / '# *...' -- confirm that is acceptable.
_spacing_rules = (
    (r'\n#([^: ][^\n]*)', r'\n# \1'),
    (r'\n(#:+)([^ ][^\n]*)', r'\n\1 \2'),
    (r'([^\n])\n====變', r'\1\n\n====變'),
)
for i in range(n):
    for _pat, _repl in _spacing_rules:
        ztxtl[i] = re.sub(_pat, _repl, ztxtl[i])

# Save only the pages whose text differs from the `ztxtlb` copy.
for i in range(n):
    if ztxtl[i] == ztxtlb[i]:
        continue
    tryedit(zh.Pages[pn[i]], ztxtl[i], 'Bot: [[User:CrowleyBot/task/19|为俄语条目搬运标题行]],ru-{nm,aj,av,vb}')