技术细节 编辑

  • 以下命令先获取含问题章节标题的页面的标题行,再获取问题行及其前 10 行(供人工判断应补上还是删去等号),最后将两者按行号合并排序:
# Step 1: list (with line numbers) every <title> line and every heading line whose
# right-hand run of '=' is one longer than the left-hand run; then keep only the
# <title> line that directly precedes such a heading in that listing.
grep --no-group-separator -n -E -e '^(=+)[^=]+\1=$' -e '<title>' zhwiktionary-20210820-pages-articles.xml | grep --no-group-separator -B 1 -E '^[0-9]*:=' | grep -v -E '^[0-9]*:=' > /tmp/tmp1
# Step 2: the same mismatched heading lines, each with the 10 lines before it as context.
grep --no-group-separator -n -B 10 -E -e '^(=+)[^=]+\1=$' zhwiktionary-20210820-pages-articles.xml > /tmp/tmp2
# Step 3: merge both listings, ordered by the line-number prefix added by grep -n.
cat /tmp/tmp1 /tmp/tmp2 | sort -n > /tmp/tmp3
  • 对于章节标题文字本身含有等号的情况,用以下命令另行检查(结果:未找到):
# Same three steps as the pipeline above, but the pattern targets headings whose
# text itself contains an '=' between the left and right runs of '='.
# Step 1: <title> lines that directly precede a matching heading in the filtered listing.
grep --no-group-separator -n -E -e '^(=+)[^=].*=.*[^=]\1=$' -e '<title>' zhwiktionary-20210820-pages-articles.xml | grep --no-group-separator -B 1 -E '^[0-9]*:=' | grep -v -E '^[0-9]*:=' > /tmp/tmp1
# Step 2: the matching heading lines, each with the 10 lines before it as context.
grep --no-group-separator -n -B 10 -E -e '^(=+)[^=].*=.*[^=]\1=$' zhwiktionary-20210820-pages-articles.xml > /tmp/tmp2
# Step 3: merge both listings, ordered by the line-number prefix added by grep -n.
cat /tmp/tmp1 /tmp/tmp2 | sort -n > /tmp/tmp3

代码 编辑

from prelude import *
from botaccount import *

# Site handle for Chinese Wiktionary, logged in before any page is touched.
# NOTE(review): mwc, UA, UN and PWD are not defined in this file; presumably they
# come from the star imports above (prelude / botaccount) -- confirm.
zh = mwc.Site(
    'zh.wiktionary.org',
    clients_useragent=UA,
)
zh.login(UN, PWD)

def process():
    """Fix headings on one page whose right side has one more '=' than the left.

    The page is selected through the module-level names ``pn`` and ``i``
    (``pn[i]`` is used as the page title); neither is defined in this block,
    so they are presumably set by the driving loop -- confirm against the caller.

    A heading line such as ``==Foo===`` is rewritten to ``==Foo==``. If the
    text changes, the page is saved through ``tryedit``; otherwise the tuple
    ``(i, pn[i])`` is printed so untouched pages can be reviewed by hand.
    """
    zpg = zh.Pages[pn[i]]
    ztxt = zpg.text()
    # [^=\n] keeps the heading text on a single line. A bare [^=] also matches
    # newlines, so with MULTILINE a match could start on one '=='-prefixed line,
    # run across '='-free lines and strip an '=' from a later, unrelated line --
    # something the line-based grep pre-scan would never have reported.
    ntxt = re.sub(r'^(=+)([^=\n]+)\1=$', r'\1\2\1', ztxt, flags=re.MULTILINE)
    # Optional extra cleanup, currently disabled:
    #ntxt = ntxt.replace('__NOEDITSECTION__\n', '')
    if ntxt != ztxt:
        tryedit(zpg, ntxt, 'Bot: [[User:CrowleyBot/task/20|修复章节标题两侧等号不同]]')
    else:
        # Nothing matched on this page: report it instead of saving a null edit.
        print((i, pn[i]))