模組:Zh-sortkey/sandbox
- 下列說明文檔位於Module:Zh-sortkey/sandbox/doc。[編輯]
- 相關連結:根頁面 • 根頁面的子頁面 • 本頁面的子頁面 • 链入 • 嵌入包含 • 測試用例 • sandbox of (差異)
本模塊用於開發Module:Zh-sortkey中的表意文字描述序列檢測功能。
- ⿰亻革 (
人09
) - ⿰亻革家 (
人09宀07
) - ⿺辶⿳穴⿲月⿱⿲幺言幺⿲長馬長刂心/⿺辶⿳穴⿲月⿱⿲幺言幺⿲长马长刂心 (
辵54
) - ⿺辶⿳穴⿲月⿱⿲幺言幺⿲長馬長刂心⿱苦⿲⿰⿹耳舌鼻⿱⿱平⿰惡意⿱眼⿰淨染⿰⿱女子身 (
辵54⿱艸05⿲⿰⿹耳00舌00鼻00⿱⿱干02⿰心08心09⿱目06⿰水08木05⿰⿱女00子00身00
) - ⿺辶⿳穴⿲月⿱⿲幺言幺⿲長馬長刂心⿺辶⿳穴⿲月⿱⿲幺言幺⿲長馬長刂心麵/⿺辶⿳穴⿲月⿱⿲幺言幺⿲长马长刂心⿺辶⿳穴⿲月⿱⿲幺言幺⿲长马长刂心面 (
辵54辵54麥09
)
假設的缺陷示例:
- ⿰石 (
⿰石00
) - ⿰石⿳女子 (
⿰石00⿳女00子00
) - ⿳女子 (
⿳女00子00
)
-- Table that collects this module's public functions; it is returned at the
-- end of the file.
local export = {}
-- Namespace text of the page currently being processed. log() compares it
-- against "Module" to decide whether to emit debug output.
local namespace = mw.title.getCurrentTitle().nsText
--[[
	Debug logger: forwards all of its arguments to mw.log, but only when
	the current page's namespace is "Module". On any other page it does
	nothing.
]]
local function log(...)
	if namespace ~= "Module" then
		return
	end
	mw.log(...)
end
--[[
	Maps each ideographic description character (IDC) to its arity, i.e.
	the number of components (single characters or nested ideographic
	description sequences) that have to follow it.
]]
local IDchars = {}
-- IDCs that take two components.
for _, idc in ipairs{ "⿰", "⿱", "⿴", "⿵", "⿶", "⿷", "⿸", "⿹", "⿺", "⿻" } do
	IDchars[idc] = 2
end
-- IDCs that take three components.
for _, idc in ipairs{ "⿲", "⿳" } do
	IDchars[idc] = 3
end
--[[
	Splits a string into an array with one entry per match of the pattern
	below: a lead byte (NUL, ASCII, or \194-\244) followed by any number of
	continuation bytes (\128-\191). For well-formed UTF-8 each entry is one
	character.
]]
local function UTF8Array(text)
	local characters = {}
	for char in string.gmatch(text, '[%z\1-\127\194-\244][\128-\191]*') do
		characters[#characters + 1] = char
	end
	return characters
end
--[[
	Concatenates the entries array[i] .. array[j] (inclusive) into a single
	string, with no separator.

	Uses table.concat's own range arguments rather than unpacking the slice
	into a temporary table: this avoids the extra allocation and the limit
	on how many values unpack can return at once.
]]
local function subconcat(array, i, j)
	return table.concat(array, "", i, j)
end
--[[
	Finds where the ideographic description sequence (IDS) that starts at
	index i ends.

	characterArray: array of characters.
	IDchar: the ideographic description character (IDC) at index i.
	i: index of IDchar in characterArray.

	Returns the index of the last character of the IDS, or nil if any
	argument is missing, IDchar is not a known IDC, or the array ends
	before every expected component has been found. Recurses whenever a
	component is itself introduced by another IDC.
]]
local function findEndOfIDS(characterArray, IDchar, i)
	if not (characterArray and IDchar and i) then
		return nil
	end
	-- Number of components expected after the current IDC.
	local components = IDchars[IDchar]
	if not components then
		return nil
	end
	local j = i
	for _ = 1, components do
		j = j + 1
		local char = characterArray[j]
		if not char then
			-- Ran off the end of the array: the sequence is incomplete.
			return nil
		elseif IDchars[char] then
			-- This component is a nested IDS; skip to its last character.
			j = findEndOfIDS(characterArray, char, j)
			if not j then
				--[[
					The nested IDS is incomplete, so the outer one is too.
					Stop here; carrying on would do arithmetic on nil.
				]]
				return nil
			end
		end
	end
	return j
end
--[[
	Looks up a codepoint in the sortkey data module that covers it.

	codepoint: the codepoint to look up.
	start: first codepoint of the block that codepoint belongs to; must be
		one of the keys of moduleStart below.
	returnModule: if truthy, return the name of the data module instead of
		loading it.

	Returns the module name (if returnModule is truthy), otherwise the
	entry stored for codepoint in the data module, or nil if the module
	could not be loaded.
]]
local function getFromModule(codepoint, start, returnModule)
	--[=[
		The sortkey modules handle two sets of codepoints.
		The first set runs from [[Module:zh-sortkey/data/001]]
		to [[Module:zh-sortkey/data/056]], then there is a gap
		of 90134 codepoints. The second set runs from
		[[Module:zh-sortkey/data/057]] to
		[[Module:zh-sortkey/data/177]].
	]=]
	local moduleStart = {
		[13312] = 1,
		[131072] = 57,
	}
	--[[
		Each data module covers 500 consecutive codepoints. Floor the
		quotient explicitly instead of handing "%d" a fractional number and
		relying on it to discard the fraction.
	]]
	local moduleNumber = math.floor((codepoint - start) / 500) + moduleStart[start]
	local moduleName = string.format("Module:zh-sortkey/data/%03d", moduleNumber)
	-- log(codepoint .. ": data module: " .. moduleName)
	if returnModule then
		return moduleName
	end
	-- pcall so that a data module that fails to load yields nil, not an error.
	local success, data = pcall(mw.loadData, moduleName)
	if success then
		-- log("success! ... " .. codepoint .. ": " .. tostring(data[codepoint]))
		return data[codepoint]
	end
	-- log("failure: " .. codepoint .. " (" .. mw.ustring.char(codepoint) .. ")")
	return nil
end
--[[
	Retrieves the sortkey data for a single character.

	char: either a string (converted to a codepoint with
		mw.ustring.codepoint) or a numeric codepoint. Any other type
		raises an error.
	returnModule: passed through to getFromModule; if truthy, the name of
		the data module is returned instead of the data.

	Returns whatever getFromModule returns for codepoints in the ranges
	13312-40938 and 131072-191456, and nil for codepoints outside them.
]]
function export.getData(char, returnModule)
	if type(char) == "string" then
		char = mw.ustring.codepoint(char)
	-- type() returns a string, so compare against "number", not a variable.
	elseif type(char) ~= "number" then
		error("getData must operate on a single character or codepoint.")
	end
	-- log(char, mw.ustring.char(char))
	if char >= 13312 and char <= 40938 then
		return getFromModule(char, 13312, returnModule)
	elseif char >= 131072 and char <= 191456 then
		return getFromModule(char, 131072, returnModule)
	else
		-- log("not in range: " .. char .. " (" .. mw.ustring.char(char) .. ")")
	end
	return nil
end
--[[
	Builds a sortkey for text.

	text: the string to build a sortkey for.
	lang: optional language code; if given and not "zh", "vi" or "ja",
		text is returned unchanged.
	sc: optional script code; if given and not "Hani", text is returned
		unchanged.

	Each character is replaced by the value export.getData returns for it,
	or kept as is when there is none. A valid ideographic description
	sequence that has an entry in [[Module:zh-sortkey/data/unsupported]]
	is replaced as a whole by that entry.
]]
function export.makeSortKey(text, lang, sc)
	local allowed_langs = {
		zh = true,
		vi = true,
		ja = true,
	}
	if lang and not allowed_langs[lang] then
		return text
	end
	if sc and sc ~= "Hani" then
		return text
	end

	local sort = {}
	local characterArray = UTF8Array(text)
	local length = #characterArray

	-- A lone character cannot begin an IDS, so look it up directly.
	if length == 1 then
		local character = characterArray[1]
		return export.getData(character) or character
	end

	local i = 1
	while i <= length do
		local character = characterArray[i]
		--[=[
			If we encounter an ideographic description character (IDC),
			find out if it begins a valid ideographic description sequence (IDS).

			If the IDS is valid and a sortkey for it is listed in
			[[Module:zh-sortkey/data/unsupported]], then return
			the sortkey, and move to the next character after the
			IDS.

			Otherwise, insert the IDC as is and move to the next character
			after it.

			If the IDS is valid and no sortkey for it is found, track it.
		]=]
		if IDchars[character] then
			local j = findEndOfIDS(characterArray, character, i)
			local IDS, data
			if j then
				IDS = subconcat(characterArray, i, j)
				data = mw.loadData("Module:zh-sortkey/data/unsupported")[IDS]
			end
			if IDS and not data then
				require("Module:debug").track("zh-sortkey/IDS-without-sortkey")
				mw.log("ideographic description sequence without sortkey: " .. IDS)
			end
			if IDS and data then
				table.insert(sort, data)
				-- Jump to the last character of the IDS; the increment
				-- below then moves past it.
				i = j
			else
				table.insert(sort, character)
			end
		else
			table.insert(sort, export.getData(character) or character)
		end
		i = i + 1
	end

	return table.concat(sort)
end
-- Hand the table of public functions (getData, makeSortKey) to whoever loads this module.
return export