local export = {}
local lang = require("Module:languages").getByCode("ca")
local m_IPA = require("Module:IPA")
local m_table = require("Module:table")
local parse_utilities_module = "Module:parse utilities"
local strutil_module = "Module:string utilities"
local listToSet = require("Module:table").listToSet
--[=[
FIXME:
1. [zʒ] should reduce to [ʒ] in Central and Balearic ([[disjunt]], [[disjuntor]]). Similar for [sʃ]
([[desxifrar]]). [DONE]
2. There needs to be a way of forcing [ʃ]. (Maybe just ʃ?) [DONE]
3. Make sure manual dot for syllable break works, cf. [[best-seller]] respelled `bèst.sèlerr'. [DONE]
4. Explicit accents on a/i/u should be removed in split_syllables(). [NOT DONE; not needed]
5. Compress double schwa in Central/Balearic in e.g. [[sobreescalfament]], [[centreafricà]], [[contraatac]],
[[contraescarpa]]; seems not to operate in Valencian.
6. Compress unstressed <ie> and <oe> followed by coda consonant -> [u] in Central/Balearic in e.g. [[aeroespacial]],
[[autoescola]], [[antiespasmòdic]], but not [[autoerotisme]], [[antiemètic]]; seems not to operate in Valencian.
NOTE: It does operate in an open syllable in [[fotoelèctric]], [[fotoelectricitat]], [[macroeconomia]]; not sure why.
7. Compress unstressed <oo> followed by a coda consonant -> [u] in Central/Balearic in e.g. [[microorganisme]]. Seems
not to operate in Valencian.
8. bm -> [mm] e.g. [[subministrament]]; seems not to operate in Valencian.
9. ë (and presumably ê) doesn't work in secondary stress, always becomes /ɛ/ (e.g. in [[extrajudicial]] respelled
'ëxtrajudiciàl'; this seems to be because the handling of ë goes through mid_vowel_hint, which doesn't work for
secondary stress. [DONE]
10. Respect ʃ at beginning of word in Valencian. [DONE]
11. [ʃ] in single substitution specs should match against written x. [DONE]
12. Prefixes e.g. [[xilo-]] should not have stress by default, and written primary stresses should be converted to
secondary. [DONE]
13. Convert apostrophe near beginning to tie (‿) and make sure we take account of it later, so that words like
[[captindre's]] and phrases like [[dona d'aigua]] work correctly. [DONE]
14. Correctly handle -bl and -gl in respelling, generating [bl] and [gl]. [NOT DONE; use _bl, _gl]
15. Correctly handle [βðɣ] in respelling forcing fricatives; should not be fortitioned. [NOT DONE; not needed]
16. [βðɣ] in single substitution specs should match against b/d/g. [NOT DONE; not needed]
17. [ss] in single substitution specs should match against ss?; used to force a pronounced [s]. [DONE]
18. [dm] in single substitution specs should match against [td]m. [NOT DONE; not needed]
19. Correctly handle written -dg- after [rz]: fricatives in Valencian, stops in Central (and Balearic?). [DONE]
20. Correctly handle lenition of written -bdg-: (1) -b- not lenited in Valencian or Balearic, lenited to [β] in
Central Catalan after vowels and consonants except nasals and [rz]; (2) -g- not lenited after nasals, also not
after [rz] in Central Catalan (and maybe Balearic?), otherwise yes except utterance initial; (3) -d- not lenited
after nasals or laterals, also not after [rz] in Central Catalan (and maybe Balearic?), otherwise yes except
utterance initial. Verify against ca-IPA equivalent on cawikt and also based on {{w|Catalan phonology}} and the IEC
grammar that Vriullop linked. [DONE]
21. Finish rewriting do_dialect_specific() to operate on whole word using Lua patterns. [DONE]
22. Implement multiword handling. [DONE]
23. Make sure suffix handling works correctly. [DONE]
24. Add many more test cases and redo test harness à la the German test harness. [DONE]
25. Redo handling of mid-vowel hints so it gets done early and in one place. [DONE]
26. Think about how to solve the issue of mid-vowel hints along with secondary stress marks in substitution specs.
Maybe a single mid-vowel spec should be rewritten to be a single substitution spec and the insertion of the
mid-vowel spec should happen during resolution of substitution specs. [DONE]
27. <tm> should default to [dm] not [mm]. [DONE]
28. Fix handling of mid vowel default in -è/-ès/-esa so it doesn't affect [[tèbia]] etc. [DONE]
29. x- after hyphen should probably become tx- in Valencian, cf. [[para-xocs]]. [DONE]
30. Implement DOTOVER to indicate lack of stress in a word, e.g. in a suffix. [DONE]
31. Handle words without vowels. [DONE]
32. Finish reviewing places where we may need to check for tie symbols.
33. Handle tie indicating liaison in e.g. [[Sant Antoni de Portmany]]. [DONE]
34. Handle pronunciation of [[amb]] correctly. [DONE]
35. Handle tie indicating liaison before h- correctly, e.g. [[Sant Hipòlit]]. [DONE]
36. Lenition should happen in Valencian in [[regla]] whether respelled 'réggla', 'régla' or 'rég_la'. [DONE]
37. Syllabification should happen correctly when underscore is used in 'bíb_lia' to block doubling of <bl>. [DONE]
38. <cn> should show up as [ŋn]. [DONE]
39. Delete [t] after [s] before anything but [s] ([[best-seller]]) or [ɾ] ([[postrem]]). [DONE]
40. Delete <t/d> after <n> before consonant even in Valencian; likewise for <p/b> after <m>, <c/g> after <n>. [DONE]
41. DOTOVER in single substitution specs should work. [DONE]
42. Underline in single substitution specs should work. [DONE]
43. LINEUNDER should work to indicate secondary stress after the primary stress, including in single substitution
specs. [DONE]
]=]
local usub = mw.ustring.sub
local rfind = mw.ustring.find
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local rsubn = mw.ustring.gsub
local ulower = mw.ustring.lower
local u = require("Module:string/char")
local ugcodepoint = mw.ustring.gcodepoint
-- Dialect codes handled by this module.
export.dialects = {"bal", "cen", "val"}
-- Display name for each dialect code. The names are in Chinese: "bal" is Balearic, "cen" is Central Catalan and
-- "val" is Valencian.
export.dialects_to_names = {
bal = "巴利阿里語",
cen = "中加泰羅尼亞語",
val = "瓦倫西亞語",
}
-- Named groups of dialect codes; "east" covers Balearic and Central.
export.dialect_groups = {
east = {"bal", "cen"},
}
-- Strings of vowel letters, meant to be embedded inside pattern character classes (hence the `_l` suffix, as
-- opposed to the bracketed classes built from them).
-- Plain written vowels, lowercase and uppercase.
local written_unaccented_vowel_l = "aeiouyAEIOUY"
-- Written vowels carrying a diacritic that marks stress (acute, grave, circumflex, and ë).
local written_stressed_vowel_l = "àèéêëíòóôúýÀÈÉÊËÍÒÓÔÚÝ"
-- Written vowels carrying a diaeresis that does not mark stress.
local written_accented_not_stressed_vowel_l = "ïüÏÜ"
local written_accented_vowel_l = written_stressed_vowel_l .. written_accented_not_stressed_vowel_l
-- IPA vowel symbols that may appear directly in a respelling.
local ipa_vowel_l = "ɔɛə"
local written_vowel_l = written_unaccented_vowel_l .. written_accented_vowel_l
local vowel_l = written_vowel_l .. ipa_vowel_l
-- Pattern class matching any single vowel (written or IPA).
local V = "[" .. vowel_l .. "]"
-- Maps each accented written vowel to its unaccented counterpart, preserving case.
local written_accented_to_plain_vowel = {
["à"] = "a",
["è"] = "e",
["é"] = "e",
["ê"] = "e",
["ë"] = "e",
["í"] = "i",
["ï"] = "i",
["ò"] = "o",
["ó"] = "o",
["ô"] = "o",
["ú"] = "u",
["ü"] = "u",
["ý"] = "y",
["À"] = "A",
["È"] = "E",
["É"] = "E",
["Ê"] = "E",
["Ë"] = "E",
["Í"] = "I",
["Ï"] = "I",
["Ò"] = "O",
["Ó"] = "O",
["Ô"] = "O",
["Ú"] = "U",
["Ü"] = "U",
["Ý"] = "Y",
}
-- Combining diacritics, built from their code points.
local AC = u(0x0301) -- combining acute accent
local GR = u(0x0300) -- combining grave accent
local CFLEX = u(0x0302) -- combining circumflex
local DOTOVER = u(0x0307) -- combining dot above; marks a word or syllable group as unstressed
local DIA = u(0x0308) -- combining diaeresis
local LINEUNDER = u(0x0331) -- combining macron below; marks secondary stress after the primary stress
-- In the names below, `_l` is a string of characters for use inside a pattern character class and `_c` is the
-- corresponding bracketed class.
local stress_l = AC .. GR
local stress_c = "[" .. stress_l .. "]"
local ipa_stress_l = "ˈˌ"
local ipa_stress_c = "[" .. ipa_stress_l .. "]"
local sylsep_l = "%-." -- hyphen included for syllabifying from spelling; FIXME: formerly included SYLDIV
local sylsep_c = "[" .. sylsep_l .. "]"
-- Tie characters: the undertie and the apostrophe.
local tie_l = "‿'"
local tie_c = "[" .. tie_l .. "]"
-- Everything that can separate characters within a word: syllable separators, ties and stress marks.
local charsep_l = sylsep_l .. tie_l .. stress_l .. ipa_stress_l
local charsep_c = "[" .. charsep_l .. "]"
-- Word separators: the word-boundary marker '#' and the space.
local wordsep_l = "# "
local wordsep_c = "[" .. wordsep_l .. "]"
local separator_l = charsep_l .. wordsep_l
local separator_c = "[" .. separator_l .. "]"
-- A consonant is defined negatively: anything that is neither a vowel nor a separator.
local neg_guts_of_cons = vowel_l .. separator_l
local C = "[^" .. neg_guts_of_cons .. "]" -- consonant class including h
-- Accented mid vowels usable as hints for the quality of a mid vowel.
export.mid_vowel_hints = "éèêëóòô"
export.mid_vowel_hint_c = "[" .. export.mid_vowel_hints .. "]"
-- Placeholder characters substituted for '(r)' and '(rr)' in word_fixes() and resolved per dialect later on.
local TEMP_PAREN_R = u(0xFFF1)
local TEMP_PAREN_RR = u(0xFFF2)
-- Pseudo-consonant at the edge of prefixes ending in a vowel and suffixes beginning with a vowel; FIXME: not currently
-- used.
local PSEUDOCONS = u(0xFFF3)
-- local PREFIX_MARKER = u(0xFFF4) -- marker indicating a prefix so we can convert primary to secondary accents
-- Set of written consonant sequences accepted as a syllable onset. split_syllables() moves leading consonants of an
-- onset into the preceding coda until what remains is in this set (or empty).
local valid_onsets = listToSet {
"b", "bl", "br",
"c", "cl", "cr",
"ç",
"d", "dj", "dr",
"f", "fl", "fr",
"g", "gl", "gr", "gu", "gü",
"h",
"i",
"j",
"k", "kl", "kr",
"l", "ll",
"m",
"n", "ny", "ñ",
"p", "pl", "pr",
"qu", "qü",
"r", "rr",
"s", "ss",
"t", "tg", "tj", "tr", "tx", "tz",
"u",
"v", "vl", "vr",
"w",
"x",
"ʃ", -- e.g. 'χruʃóf' respelling of [[Khrusxov]]
"χ", -- in case of respelling
"y",
"z",
}
-- Maps each precomposed letter with a dot above to the plain letter followed by the combining DOTOVER character.
-- Used by decompose_respelling() both as the replacement table and (via its keys) to build the search class.
local decompose_dotover = {
-- No composed i, u or U with DOTOVER.
["ȧ"] = "a" .. DOTOVER,
["ė"] = "e" .. DOTOVER,
["ȯ"] = "o" .. DOTOVER,
["ẏ"] = "y" .. DOTOVER,
["Ȧ"] = "A" .. DOTOVER,
["Ė"] = "E" .. DOTOVER,
["İ"] = "I" .. DOTOVER,
["Ȯ"] = "O" .. DOTOVER,
["Ẏ"] = "Y" .. DOTOVER,
}
-- Set of (lowercase) words treated as unstressed. handle_unstressed_words() marks any word found here with DOTOVER
-- so that no default stress is assigned to it.
local unstressed_words = listToSet {
-- proclitic object pronouns
"em", "et", "es", "el", "la", "els", "les", "li", "ens", "us", "ho", "hi", "en",
-- enclitic object pronouns usually attach with hyphen to preceding verb but not always, cf. [[tant me fa]]
"me", "te", "se", "lo", "los", "nos", "vos", "ne",
-- contracted object pronouns and articles attached with apostrophe so no need to include
-- unstressed possessives
"mon", "ma", "mos", "mes", "ton", "ta", "tos", "tes", "son", "sa", "sos", "ses",
-- prepositions
"a", "de", "per", "amb", "ab", -- 'en' already included as proclitic object pronouns
-- prepositional contractions
"al", "als", "del", "dels", "pel", "pels",
-- articles 'el', 'la', 'els', 'les' already included as proclitic pronouns
-- personal articles
"na", -- 'en' already included above
-- indefinite articles
"un", "uns",
-- salat articles
"ets", "so", -- 'es' already included as proclitic object pronouns and 'ses', 'sa', 'sos' as possessives
-- conjunctions
"i", "o", "si", "ni", "que",
}
-- Wrapper around rsubn() that yields only the substituted string and drops the substitution count.
local function rsub(term, foo, bar)
	-- The enclosing parentheses truncate rsubn()'s multiple return values to the first one.
	return (rsubn(term, foo, bar))
end
-- Wrapper around rsubn() that returns the substituted string plus a boolean telling whether at least one
-- substitution took place.
local function rsubb(term, foo, bar)
	local result, count = rsubn(term, foo, bar)
	local changed = count > 0
	return result, changed
end
-- Keep applying rsub() with the same pattern and replacement until the string stops changing, then return it.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end
-- Break `text` into a list of single-character strings, one per Unicode code point.
local function split_into_chars(text)
	local chars, n = {}, 0
	for cp in ugcodepoint(text) do
		n = n + 1
		chars[n] = u(cp)
	end
	return chars
end
-- Split `term` on commas. The simple case is handled directly with rsplit(); when the term contains a comma
-- followed by whitespace, or a backslash, the work is delegated to split_on_comma() in the parse utilities module
-- (presumably so such commas/escapes are treated specially -- verify against that module).
local function split_on_comma(term)
	local needs_special_handling = term:find(",%s") or term:find("\\")
	if not needs_special_handling then
		return rsplit(term, ",")
	end
	return require(parse_utilities_module).split_on_comma(term)
end
-- Concatenate all keys of `tab` into a single string. The keys are visited with pairs(), so their order in the
-- result is unspecified; the result is intended for use inside a pattern character class, where order is irrelevant.
local function concat_keys(tab)
	local keys, n = {}, 0
	for key in pairs(tab) do
		n = n + 1
		keys[n] = key
	end
	return table.concat(keys)
end
-- Process a list of word objects (tables with at least a `term` field) so that unstressed function words are
-- handled correctly. Works on a deep copy; the caller's list is not modified. Steps:
-- 1. Lowercase every term.
-- 2. Mark every term found in `unstressed_words` with DOTOVER.
-- 3. Join [[amb]] with a following vowel-initial (optionally h-initial) word using a tie character.
-- 4. Respell any remaining [[amb]] and [[per]] to get the right pronunciation.
-- Returns the new list of word objects.
local function handle_unstressed_words(words)
	words = m_table.deepCopy(words)
	-- Lowercase all words for ease in further processing.
	for _, wordobj in ipairs(words) do
		wordobj.term = ulower(wordobj.term)
	end
	-- Check if the word at index `i` in `words` is "amb" (already marked with DOTOVER) and the following word begins
	-- with a vowel.
	local function is_amb_to_join(words, i)
		return i < #words and words[i].term == "a" .. DOTOVER .. "mb" and rfind(words[i + 1].term, "^h?" .. V)
	end
	-- Must start out false: it records whether the marking loop below actually found an [[amb]] to join, and gates
	-- the (otherwise pointless) rebuilding of the word list.
	local saw_amb_to_join = false
	-- Mark all unstressed words with DOTOVER, so that split_syllables() doesn't assign stress. We need to do this
	-- before special handling for [[amb]], because [[amb]] may join to another unstressed word like [[el]], in the
	-- process losing the identity of the two words. In the process, see if [[amb]] occurs before a following
	-- vowel-initial word (which may begin with h-).
	for i, wordobj in ipairs(words) do
		-- Put DOTOVER after the last vowel (to handle the case of [[que]]). It doesn't actually matter where we put
		-- it, because split_syllables() just looks for DOTOVER anywhere in the word.
		if unstressed_words[wordobj.term] then
			wordobj.term = rsub(wordobj.term, "^(.*" .. V .. ")", "%1" .. DOTOVER)
		end
		if is_amb_to_join(words, i) then
			saw_amb_to_join = true
		end
	end
	-- Join [[amb]] before vowel-initial word with following word.
	if saw_amb_to_join then
		local new_words = {}
		local i = 1
		while i <= #words do
			if is_amb_to_join(words, i) then
				-- The joined word takes its `pos` from the second word.
				table.insert(new_words, {term = words[i].term .. "‿" .. words[i + 1].term, pos = words[i + 1].pos})
				i = i + 2
			else
				table.insert(new_words, words[i])
				i = i + 1
			end
		end
		words = new_words
	end
	-- Finally, rewrite some unstressed words to get the right pronunciation. Any remaining [[amb]] not before a
	-- vowel-initial word is pronounced [am] even in Valencian (where [amp]/[amb] would be expected), and [[per]] always
	-- has a pronounced <r>.
	local unstressed_word_replacement = {
		["a" .. DOTOVER .. "mb"] = "a" .. DOTOVER .. "m",
		["pe" .. DOTOVER .. "r"] = "pe" .. DOTOVER .. "rr",
	}
	for _, wordobj in ipairs(words) do
		wordobj.term = unstressed_word_replacement[wordobj.term] or wordobj.term
	end
	return words
end
-- Respell certain prefixes so that later stages pronounce them correctly: <s> is respelled <z> in the prefix roots
-- -fons-, -dins- and -trans- in the contexts listed below, and a syllable break is inserted in word-initial inex-
-- (in + ex > ineks/inegz).
local function fix_prefixes(word)
	local prefix_rules = {
		-- Voiced s in prefix roots -fons-, -dins-, -trans-
		{"^enfons([aàeèéiíoòóuú])", "enfonz%1"},
		{"^endins([aàeèéiíoòóuú])", "endinz%1"},
		{"tr([aà])ns([aàeèéiíoòóuúbdghlmv])", "tr%1nz%2"},
		-- in + ex > ineks/inegz
		{"^inex", "in.ex"},
	}
	for _, rule in ipairs(prefix_rules) do
		word = rsub(word, rule[1], rule[2])
	end
	return word
end
-- Some word-final sequences are written without a diaeresis (or accent) even though the <i>/<u> forms its own
-- syllable. Re-insert the diacritic so that hiatus is recognized during syllabification. Rules are applied in
-- order; each is anchored at the end of the word.
local function restore_diaereses(word)
	local rules = {
		{"([iu])um(s?)$", "%1üm%2"}, -- Latinisms (-ius is ambiguous but rare)
		{"([aeiou])isme(s?)$", "%1ísme%2"}, -- suffix -isme
		{"([aeiou])ist([ae]s?)$", "%1íst%2"}, -- suffix -ista
		{"([aeou])ir$", "%1ír"}, -- verbs -ir
		{"([aeou])int$", "%1ínt"}, -- present participle
		-- future
		{"([aeo])ir([éà])$", "%1ïr%2"},
		{"([^gq]u)ir([éà])$", "%1ïr%2"},
		{"([aeo])iràs$", "%1ïràs"},
		{"([^gq]u)iràs$", "%1ïràs"},
		{"([aeo])ir(e[mu])$", "%1ïr%2"},
		{"([^gq]u)ir(e[mu])$", "%1ïr%2"},
		{"([aeo])iran$", "%1ïran"},
		{"([^gq]u)iran$", "%1ïran"},
		-- conditional
		{"([aeo])iria$", "%1ïria"},
		{"([^gq]u)iria$", "%1ïria"},
		{"([aeo])ir(ie[sn])$", "%1ïr%2"},
		{"([^gq]u)ir(ie[sn])$", "%1ïr%2"},
	}
	for _, rule in ipairs(rules) do
		word = rsub(word, rule[1], rule[2])
	end
	return word
end
-- Normalize written <y>: the digraph <ny> becomes <ñ>, and any other <y> becomes vocalic <i> when it is not next
-- to another vowel. A <y> that remains after this is left for later stages to treat as a consonant.
local function fix_y(word)
	local vowel_letters = "aeiouàèéêëíòóôúïü"
	word = rsub(word, "ny", "ñ")
	-- <y> followed by a non-vowel
	word = rsub(word, "y([^" .. vowel_letters .. "])", "i%1")
	-- <y> preceded by a non-vowel that is also not a syllable separator
	word = rsub(word, "([^" .. vowel_letters .. "·%-%.])y", "%1i")
	return word
end
-- Apply default mid-vowel qualities to certain word endings by adding an accent to the final <e>/<o>. Each time a
-- rule fires, a tracking entry is recorded through Module:debug/track. Rules are tried in order and all of them are
-- always attempted.
-- NOTE: final -el (not -ell) usually è but not too many cases, so there is no rule for it.
local function mid_vowel_fixes(word)
	-- Each rule is {pattern, replacement, tracked vowel, tracked context}.
	local rules = {
		{"e(nts?)$", "é%1", "e", "nt-nts"},
		{"e(rs?)$", "é%1", "e", "r-rs"},
		{"o(rs?)$", "ó%1", "o", "r-rs"},
		{"è(s?)$", "ê%1", "è", "s-blank"},
		{"e(s[oe]s)$", "ê%1", "e", "sos-sa-ses"},
		{"e(sa)$", "ê%1", "e", "sos-sa-ses"},
	}
	for _, rule in ipairs(rules) do
		local pattern, replacement, vowel, context = rule[1], rule[2], rule[3], rule[4]
		local fixed, changed = rsubb(word, pattern, replacement)
		word = fixed
		if changed then
			require("Module:debug/track"){"ca-IPA/" .. vowel, "ca-IPA/" .. vowel .. "/" .. context}
		end
	end
	return word
end
-- Apply word-level orthographic normalizations to a single word of the respelling, before syllabification.
-- `dialect` is the dialect code; only "val" triggers dialect-specific behavior here. The order of the steps below
-- matters. Returns the normalized word.
local function word_fixes(word, dialect)
-- Replace the optional-r notations '(rr)' and '(r)' with single placeholder characters; '(rr)' must be handled
-- first so that it isn't partly consumed by the '(r)' rule.
word = rsub(word, "%(rr%)", TEMP_PAREN_RR)
word = rsub(word, "%(r%)", TEMP_PAREN_R)
-- Double an <r> or <s> directly following a hyphen (a hyphen followed by anything else is left unchanged).
-- NOTE(review): presumably so that it is later treated like <rr>/<ss>, as at the start of a word -- confirm.
word = rsub(word, "%-([rs]?)", "-%1%1")
if dialect == "val" then
-- In Valencian, <x> after a hyphen is respelled <tx>.
word = rsub(word, "%-x", "-tx")
end
word = rsub(word, "rç$", "rrs") -- silent r only in plurals -rs
word = fix_prefixes(word) -- internal pause after a prefix
word = restore_diaereses(word) -- no diaeresis saving
word = fix_y(word) -- ny > ñ else y > i vowel or consonant
word = mid_vowel_fixes(word)
-- all words in pn- (e.g. [[pneumotòrax]] and mn- (e.g. [[mnemònic]]) have silent p/m in both Central and Valencian
word = rsub(word, "^[pm]n", "n")
-- Respell ch + vowel as tx, before we remove other h's after consonants.
word = rsub(word, "ch(" .. V ..")", "tx%1")
-- Delete h after a consonant. This must happen here, before split_syllables(). We don't delete h after a vowel
-- yet because it indicates a hiatus.
word = rsub(word, "(" .. C .. ")h", "%1")
return word
end
-- Split a non-empty run of adjacent vowel letters into syllable objects. The first vowel always starts a syllable
-- and carries the `has_dotover`/`has_lineunder` flags passed in. In the remainder, an <i> or <u> followed by another
-- vowel becomes the onset (glide) of the next syllable. If the last syllable produced is a bare <i> or <u>, it is
-- folded into the coda of the preceding syllable (falling diphthong).
-- Every syllable object has the string fields `onset`, `vowel`, `coda` and `separator`, so that consumers such as
-- reconstitute_word_from_syllables() never see a nil field.
local function split_vowels(vowels, saw_dotover, saw_lineunder)
	local syllables = {{onset = "", vowel = usub(vowels, 1, 1), coda = "", separator = "", has_dotover = saw_dotover,
		has_lineunder = saw_lineunder}}
	vowels = usub(vowels, 2)
	while vowels ~= "" do
		-- `separator` is initialized here as well, for consistency with the first syllable.
		local syll = {onset = "", vowel = "", coda = "", separator = ""}
		syll.onset, syll.vowel, vowels = rmatch(vowels, "^([iu]?)(.)(.-)$")
		table.insert(syllables, syll)
	end
	local count = #syllables
	if count >= 2 and (syllables[count].vowel == "i" or syllables[count].vowel == "u") then
		syllables[count - 1].coda = syllables[count].vowel
		syllables[count] = nil
	end
	return syllables
end
-- Split the word into syllables. Return a list of syllable objects, each of which contains fields `onset`, `vowel`,
-- `coda`, `separator` (a user-specified syllable divider that goes before the syllable; one of '·', '-' or '.') and
-- `stressed` (a boolean indicating that the syllable is stressed). In addition, the list has fields `stress` (the
-- index of the syllable with primary stress), `is_prefix` (true if the word is a prefix, i.e. it ends in '-') and
-- `is_suffix` (true if the word is a suffix, i.e. it begins with '-').
-- Normally, prefixes are treated as unstressed if a stressed syllable isn't explicitly marked, but this can be
-- overridden with `stress_prefixes`, which causes the automatic stress-assignment algorithm to run for these terms.
-- If `may_be_uppercase` is set, uppercase vowel letters are recognized as vowels too. No default stress is assigned
-- if DOTOVER occurs anywhere in the word, or if the word yields no syllables at all.
local function split_syllables(word, stress_prefixes, may_be_uppercase)
	local syllables = {}
	local saw_dotover = false
	local remainder = word
	local is_prefix = false
	if remainder:find("%-$") then -- prefix
		is_prefix = true
		remainder = remainder:gsub("%-$", "")
	end
	local is_suffix = false
	if remainder:find("^%-") then -- suffix
		is_suffix = true
		remainder = remainder:gsub("^%-", "")
	end
	-- FIXME: Using C and V below instead of the existing patterns slows things down TREMENDOUSLY.
	-- Not sure why.
	-- The vowel list and the two patterns built from it don't depend on the loop state, so build them once.
	local vowel_list = may_be_uppercase and "aeiouàèéêëíòóôúïüAEIOUÀÈÉÊËÍÒÓÔÚÏÜ" .. DOTOVER .. LINEUNDER or
		"aeiouàèéêëíòóôúïü" .. DOTOVER .. LINEUNDER
	local consonant_run_pattern = "^([^" .. vowel_list .. "]*)(.-)$"
	local vowel_run_pattern = "^([" .. vowel_list .. "]*)(.-)$"
	-- Repeatedly peel off a (possibly empty) run of consonants followed by a (possibly empty) run of vowels.
	while remainder ~= "" do
		local consonants, vowels
		consonants, remainder = rmatch(remainder, consonant_run_pattern)
		vowels, remainder = rmatch(remainder, vowel_run_pattern)
		local this_saw_dotover = not not rfind(vowels, DOTOVER)
		if this_saw_dotover then
			saw_dotover = true
			vowels = vowels:gsub(DOTOVER, "")
		end
		local this_saw_lineunder = not not rfind(vowels, LINEUNDER)
		if this_saw_lineunder then
			vowels = vowels:gsub(LINEUNDER, "")
		end
		if vowels == "" then
			-- Trailing consonants: attach to the coda of the last syllable, if any.
			if #syllables > 0 then
				syllables[#syllables].coda = syllables[#syllables].coda .. consonants
			else
				-- word without vowels, e.g. foot boundary |
				table.insert(syllables, {onset = consonants, vowel = "", coda = "", separator = ""})
			end
		else
			local onset = consonants
			local first_vowel = usub(vowels, 1, 1)
			-- The first "vowel" is really part of the onset in two cases: (1) <ü>, or <u> followed by another vowel,
			-- after g/q; (2) word-initial <i> (optionally after h) followed by another vowel.
			if (rfind(onset, "[gqGQ]$") and (first_vowel == "ü" or (first_vowel == "u" and vowels ~= "u")))
				or ((onset == "" or onset == "h" or onset == "H") and #syllables == 0 and
					(first_vowel == "i" or first_vowel == "I") and (vowels ~= "i" and vowels ~= "I"))
			then
				onset = onset .. usub(vowels, 1, 1)
				vowels = usub(vowels, 2)
			end
			local vsyllables = split_vowels(vowels, this_saw_dotover, this_saw_lineunder)
			vsyllables[1].onset = onset .. vsyllables[1].onset
			for _, s in ipairs(vsyllables) do
				table.insert(syllables, s)
			end
		end
	end
	-- Shift over consonants from the onset to the preceding coda, until the syllable onset is valid
	for i = 2, #syllables do
		local current = syllables[i]
		local previous = syllables[i-1]
		-- A trailing tie (optionally followed by h) and any underscores are ignored when validating the onset.
		while not (current.onset == "" or valid_onsets[rsub(rsub(current.onset, tie_c .. "[hH]?$", ""), "_", "")]) do
			local letter = usub(current.onset, 1, 1)
			current.onset = usub(current.onset, 2)
			if rfind(letter, "[·%-%.]") then -- syllable separators
				current.separator = letter
				break
			else
				previous.coda = previous.coda .. letter
				if rfind(letter, tie_c) then
					break
				end
			end
		end
	end
	-- Detect stress
	for i, syll in ipairs(syllables) do
		if rfind(syll.vowel, "^[" .. written_stressed_vowel_l .. "]$") then
			syll.stressed = true
			-- primary stress: the last one stressed without LINEUNDER
			if not syll.has_lineunder then
				syllables.stress = i
			end
		end
	end
	-- Assign default stress
	if not syllables.stress and not saw_dotover and (stress_prefixes or not is_prefix) then
		local count = #syllables
		if count == 1 then
			if syllables[1].vowel ~= "" then -- vowel-less words don't get stress
				syllables.stress = 1
			end
		elseif count > 1 then
			-- (count == 0 happens for an empty word; there is nothing to stress in that case.)
			local final = syllables[count]
			-- Penultimate stress if the word ends in a vowel, vowel + s, or -en/-in; otherwise final stress.
			-- Take account of tie symbols (apostrophes and ‿).
			if rfind(final.coda, "^[s" .. tie_l .. "]*$") or (rfind(final.coda, "^" .. tie_c .. "*n" .. tie_c .. "*$") and (
				final.vowel == "e" or final.vowel == "i" or final.vowel == "ï")) then
				syllables.stress = count - 1
			else
				syllables.stress = count
			end
		end
		if syllables.stress then
			syllables[syllables.stress].stressed = true
		end
	end
	syllables.is_prefix = is_prefix
	syllables.is_suffix = is_suffix
	return syllables
end
-- Rebuild a word string from a syllable list as produced by split_syllables(). For each syllable the pieces are
-- emitted in the order separator, onset, vowel, DOTOVER (if `has_dotover`), LINEUNDER (if `has_lineunder`), coda.
-- A leading '-' is added for suffixes and a trailing '-' for prefixes.
local function reconstitute_word_from_syllables(syllables)
	local pieces = {}
	local function add(piece)
		-- Assigning nil here is a no-op, so a missing field simply contributes nothing.
		pieces[#pieces + 1] = piece
	end
	if syllables.is_suffix then
		add("-")
	end
	for idx = 1, #syllables do
		local syl = syllables[idx]
		add(syl.separator)
		add(syl.onset)
		add(syl.vowel)
		if syl.has_dotover then
			add(DOTOVER)
		end
		if syl.has_lineunder then
			add(LINEUNDER)
		end
		add(syl.coda)
	end
	if syllables.is_prefix then
		add("-")
	end
	return table.concat(pieces)
end
-- Replace every precomposed dot-above letter listed in `decompose_dotover` with its base letter followed by the
-- combining DOTOVER character.
local function decompose_respelling(text)
	-- Character class matching any of the precomposed letters (key order is irrelevant inside a class).
	local precomposed_class = "[" .. concat_keys(decompose_dotover) .. "]"
	return rsub(text, precomposed_class, decompose_dotover)
end
-- Canonicalize the punctuation and spacing of a respelling: collapse whitespace, strip punctuation that has no
-- phonetic effect, and convert pause-like punctuation to the IPA foot boundary ' | '.
local function canon_respelling(text)
	-- Collapse runs of whitespace to a single space and trim one leading/trailing space.
	local function squeeze_and_trim(str)
		return rsub(rsub(rsub(str, "%s+", " "), "^ ", ""), " $", "")
	end
	-- Ordered {pattern, replacement} pairs; the order is significant.
	local punctuation_rules = {
		-- eliminate upside down punctuation
		{"[¡¿]", ""},
		-- eliminate utterance-final punctuation
		{"[!?.]$", ""},
		-- eliminate double and triple quotes
		{"''+", ""},
		-- Convert commas and em/en dashes to IPA foot boundaries; require a space after commas and en dashes (for
		-- the latter, in particular, to avoid treating the en dash in 'Bose–Einstein condensate' as a foot
		-- boundary).
		{" *[,–] ", " | "},
		{" *[—] *", " | "},
		-- ... in phrases like [[com es diu...en català]] and [[necessito ...]] become foot boundaries
		{" *%.%.%. *", " | "},
		-- remaining commas and en dashes become spaces
		{"[,–]", " "},
	}
	text = squeeze_and_trim(text)
	for _, rule in ipairs(punctuation_rules) do
		text = rsub(text, rule[1], rule[2])
	end
	-- may need to eliminate extraneous spaces again, e.g. if there was a space before or after an eliminated
	-- punctuation mark
	text = squeeze_and_trim(text)
	-- question mark or exclamation point in the middle of a sentence -> IPA foot boundary
	return rsub(text, "([^ ]) *[!?] *([^ ])", "%1 | %2")
end
-- Per-dialect IPA values for the mid-vowel hint letters ê, ë and ô. The three tables differ only in the values
-- assigned to these letters. NOTE(review): the code that consults these tables is not in this part of the file;
-- presumably the dialect-specific table takes precedence over the generic IPA_vowels below -- confirm.
local IPA_vowels_central = {
["ê"] = "ɛ", ["ë"] = "ɛ", ["ô"] = "ɔ",
}
local IPA_vowels_balearic = {
["ê"] = "ə", ["ë"] = "ɛ", ["ô"] = "ɔ",
}
local IPA_vowels_valencian = {
["ê"] = "e", ["ë"] = "e", ["ô"] = "o",
}
-- Generic mapping from accented written vowels to IPA vowel symbols.
local IPA_vowels = {
["à"] = "a",
["è"] = "ɛ", ["ê"] = "ɛ", ["ë"] = "ɛ", ["é"] = "e",
["í"] = "i", ["ï"] = "i",
["ò"] = "ɔ", ["ô"] = "ɔ", ["ó"] = "o",
["ú"] = "u", ["ü"] = "u",
}
-- Convert the written consonants in `cons` to (mostly IPA) symbols using substitutions that don't depend on the
-- surrounding context. The substitutions are applied in the order listed, which matters in several places (see the
-- notes on individual rules).
-- NOTE: We use single-character affricate symbols during processing for ease in handling, and convert them
-- to tied multi-character affricates at the end of join_syllables().
-- <x> is deliberately not converted to ʃ here, so that written x can still be distinguished from manually
-- specified ʃ.
local function replace_context_free(cons)
	local substitutions = {
		{"ŀ", "l"},
		{"r", "ɾ"},
		{"ɾɾ", "r"}, -- must follow r > ɾ: a double rr ends up as r
		{"ss", "s"},
		{"ll", "ʎ"},
		{"ñ", "ɲ"}, -- hint ny > ñ
		{"[dt]j", "ʤ"},
		{"tx", "ʧ"},
		{"[dt]z", "ʣ"},
		{"ç", "s"},
		{"[cq]", "k"},
		{"h", ""},
		{"j", "ʒ"},
		{"i", "j"}, -- must be after j > ʒ
		{"y", "j"}, -- must be after j > ʒ and fix_y
		{"[uü]", "w"},
		{"'", "‿"},
	}
	for _, pair in ipairs(substitutions) do
		cons = rsub(cons, pair[1], pair[2])
	end
	return cons
end
-- Do context-sensitive phonological changes. Formerly this was all done syllable-by-syllable but that made the code
-- tricky (since it often had to look at adjacent syllables) and full of subtle bugs. Now we first concatenate the
-- syllables back to words and the words to the combined text and work on the text as a whole. FIXME: We should move
-- more of the work done in preprocess_word(), e.g. most of replace_context_free(), here.
local function postprocess_general(text, dialect)
local function verify(cond, msg)
if not cond then
error(("Internal error: %s; processed respelling at this point is '%s'"):format(msg, text))
end
return true
end
local voiced = listToSet {"b", "d", "g", "m", "n", "ɲ", "l", "ʎ", "r", "ɾ", "v", "z", "ʒ", "ʣ", "ʤ"}
local voiced_keys = concat_keys(voiced)
local voiceless = listToSet {"p", "t", "k", "f", "s", "ʃ", "ʦ", "ʧ"}
local voiceless_keys = concat_keys(voiceless)
local voicing = {["p"] = "b", ["t"] = "d", ["k"] = "g", ["f"] = "v", ["s"] = "z", ["ʃ"] = "ʒ", ["ʦ"] = "ʤ",
["ʧ"] = "ʤ"}
local voicing_keys = concat_keys(voicing)
local devoicing = {}
for k, v in pairs(voicing) do
devoicing[v] = k
end
local devoicing_keys = concat_keys(devoicing)
------------------ Handle <x>
-- Handle ex- + vowel > -egz-. We handle -x- on either side of the syllable boundary. Note that this also handles
-- inex- + vowel because in fix_prefixes we respell inex- as in.ex-, which ends up at this stage as in.e.xV.
text = rsub_repeatedly(text, "([.#][eɛ]" .. stress_c .. "*)(" .. charsep_c .. "*)x(" .. charsep_c .. "*" .. V ..
")", function(e, pre, post)
-- Preserve other character separators (especially the tie character ‿).
pre = pre:gsub("%.", "")
post = post:gsub("%.", "")
return e .. pre .. "g.z" .. post
end)
-- -x- at the beginning of a coda becomes [ks], e.g. [[annex]], [[apèndix]], [[extracció]]; but not elsewhere in
-- the coda, e.g. in [[romanx]], [[ponx]]; words with [ks] in -nx such as [[esfinx]], [[linx]], [[manx]] need
-- respelling with [ks]; words ending in vowel + x like [[ídix]] need respelling with [ʃ]
text = rsub(text, "(" .. V .. stress_c .. "*)x", "%1ks")
if dialect == "val" then
-- Word-initial <x> as well as <x> after a consonant other than /j/ (including in the coda, e.g. [[ponx]])
-- becomes [t͡ʃ].
text = rsub(text, "#x", "#ʧ")
text = rsub(text, "([^" .. vowel_l .. separator_l .. "j]" .. charsep_c .. "*)x", "%1ʧ")
end
-- Other x becomes [ʃ]
text = rsub(text, "x", "ʃ")
-- Doubled ss -> s e.g. in exs-, exc(e/i)-, sc(e/i)-; FIXME: should this apply across word boundaries?
text = rsub(text, "s(" .. charsep_c .. "*)s", "%1s")
------------------ Coda consonant losses
-- In Central Catalan, coda losses happen everywhere, but otherwise they don't happen when
-- absolutely word-finally before a vowel or end of utterance (e.g. [[blanc]] has /k/ in Balearic and
-- Valencian but not [[blancs]]). Must precede consonant assimilations.
local boundary = dialect == "cen" and "(.)" or "([^#])"
text = rsub(text, "m[pb]" .. boundary, "m%1")
text = rsub(text, "([ln])[td]" .. boundary, "%1%2")
text = rsub(text, "[nŋ][kg]" .. boundary, "ŋ%1")
if dialect == "val" or dialect == "bal" then
local before_cons = "(" .. separator_c .. "*" .. C .. ")"
text = rsub(text, "m[pb]" .. before_cons, "m%1")
text = rsub(text, "([ln])[td]" .. before_cons, "%1%2")
text = rsub(text, "[nŋ][kg]" .. before_cons, "ŋ%1")
end
-- Delete /t/ between /s/ and any consonant other than /s/ or /ɾ/. Must precede voicing assimilation and
-- t + lateral/nasal assimilation.
text = rsub(text, "st(" .. sylsep_c .. "*[^" .. neg_guts_of_cons .. "sɾ])", "s%1")
------------------ Consonant assimilations
if dialect == "cen" then
-- v > b in onsets (not in codas, e.g. [[ovni]] [ɔ́vni] and [[hafni]] [ávni]). This needs to precede
-- assimilation of nb -> mb.
text = rsub(text, "v(" .. C .. "*" .. V ..")", "b%1")
end
-- t + lateral assimilation -> geminate across syllable boundary. We don't any more do t + nasal assimiation
-- because there are too many exceptions, e.g. [[aritmètic]], [[atmosfèric]], [[ètnia]]. Instead, we require that
-- cases where it does happen use respelling to effect this. FIXME: this doesn't always happen in -tl- either,
-- e.g. [[atlàntic]] has [əllántik] in GDLC but [adlántik] in DNV.
--
-- FIXME: Clean this up, maybe move below voicing assimilation, investigate whether it operates across words,
-- move stuff below that special-cases tll in Valencian here.
text = rsub(text, "t(" .. sylsep_c .. ")([lʎ])", "%2%1%2")
-- n + labial > labialized assimilation
text = rsub(text, "n(" .. separator_c .. "*[mbp])", "m%1")
text = rsub(text, "n(" .. separator_c .. "*[fv])", "ɱ%1")
-- n + velar > velarized assimilation
text = rsub(text, "n(" .. separator_c .. "*[kg])", "ŋ%1")
-- l/n + palatal > palatalized assimilation
text = rsub(text, "([ln])(" .. separator_c .. "*[ʎɲʃʒʧʤ])", function(ln, palatal)
ln = ({["l"] = "ʎ", ["n"] = "ɲ"})[ln]
return ln .. palatal
end)
-- ɲs > ɲʃ; FIXME: not sure the purpose of this; it doesn't apply in [[menys]] or derived terms like [[menyspreu]]
-- NOTE: Per [https://giec.iec.cat/textgramatica/codi/4.4], it does apply in these scenarios but the result is
-- somewhere between [s] and [ʃ], which is why it isn't shown in GDLC.
-- text = rsub(text, "ɲs", "%1ʃ")
------------------ Handle <r>
-- In replace_context_free(), we converted single r to ɾ and double rr to r.
if dialect == "cen" then
text = rsub(text, TEMP_PAREN_R, "")
text = rsub(text, TEMP_PAREN_RR, "r")
elseif dialect == "bal" then
text = rsub(text, TEMP_PAREN_R, "")
text = rsub(text, TEMP_PAREN_RR, "")
else
verify(dialect == "val", ("Unrecognized dialect '%s'"):format(dialect))
text = rsub(text, TEMP_PAREN_R, "ɾ")
text = rsub(text, TEMP_PAREN_RR, "ɾ")
end
if dialect ~= "val" then
-- Coda /ɾ/ -> /r/
-- FIXME: This is inherited from the older code. Correct?
text = rsub(text, "(" .. V .. stress_c .. "*" .. C .. "*)ɾ", "%1r")
end
-- ɾ -> r word-initially or after [lns]; needs to precede voicing assimilation as <s> will be voiced to [z] before
-- /ɾ/.
text = rsub(text, "([#lns]" .. sylsep_c .. "*)ɾ", "%1r")
------------------ Voicing assimilation
-- Voicing or devoicing; we want to proceed from right to left, and due to the limitations of patterns (in
-- particular, the lack of support for alternations), it's difficult to do this cleanly using Lua patterns, so we
-- do it character by character.
local chars = split_into_chars(text)
-- We need to look two characters ahead in some cases, so start two characters from the end. This is safe because
-- the overall respelling ends in "##". (Similarly, as an optimization, don't check the first two characters, which
-- are always "##".)
for i = #chars - 2, 3, -1 do
-- We are looking for two consonants next to each other, possibly separated by a syllable or word divider.
-- We also handle a consonant followed by a syllable divider then a vowel, and a consonant word-finally.
-- Note that only coda consonants can change voicing, so we need to check to make sure we're in the coda.
local first = chars[i]
-- If `second` is nil, no assimilation occurs. Otherwise, `second` should be a consonant or empty string (which
-- represents a syllable or word boundary followed by a vowel or end of string), and we assimilate to that
-- consonant (empty string forces devoicing).
local second
-- If set to true, we're processing a consonant directly before a word boundary followed by a word beginning
-- with a vowel. In this context, voiceless sibilants voice. Note that we handle voicing of <s> word-internally
-- separately, in preprocess_word() [FIXME: maybe move much of the processing in preprocess_word() into this
-- function].
local word_boundary_before_vowel
if not rfind(first, C) then
-- leave `second` at nil; no assimilation
elseif chars[i + 1] == "#" then -- word boundary
if chars[i + 2] == " " then
-- chars[i + 3] should always be "#"
verify(chars[i + 3] == "#", "Word boundary followed by space but not #")
if rfind(chars[i + 4], C) then
second = chars[i + 4]
else
second = ""
word_boundary_before_vowel = true
end
else
second = ""
end
elseif rfind(chars[i + 1], sylsep_c) then -- syllable boundary
if rfind(chars[i + 2], C) then
second = chars[i + 2]
else
second = ""
end
elseif rfind(chars[i + 1], C) then
second = chars[i + 1]
else
-- followed by a vowel not across a syllable or word boundary; leave `second` as nil, no assimilation
end
if second then
-- Make sure we're in the coda. We have to look backwards until we find a vowel or syllable/word boundary.
local in_coda = false
local j = i - 1
while true do
verify(j > 0, "Missing word boundary at beginning of overall respelling")
if rfind(chars[j], "[" .. sylsep_l .. wordsep_l .. "]") then
break
elseif rfind(chars[j], V) then
in_coda = true
break
end
j = j - 1
end
if in_coda then
if word_boundary_before_vowel and rfind(first, "[zʒʣʤ]") then
-- leave alone
elseif voiced[second] and voicing[first] or word_boundary_before_vowel and rfind(first, "[sʃʦʧ]") then
chars[i] = voicing[first]
elseif (voiceless[second] or second == "") and devoicing[first] then
chars[i] = devoicing[first]
end
end
end
end
text = table.concat(chars)
-- gn -> ŋn e.g. [[regnar]] (including word-initial gn- e.g. [[gnòmic]], [[gneis]])
-- FIXME: This should be moved below voicing assimilation, and we need to investigate if it operates across words
-- (here I'm guessing yes).
if dialect ~= "cen" then
text = rsub(text, "#gn", "#n")
end
text = rsub(text, "g(" .. separator_c .. "*n)", "ŋ%1")
-- gʒ > d͡ʒ
-- FIXME: We need to investigate if it operates across words
text = rsub(text, "g(" .. sylsep_c .. "*)ʒ", "%1ʤ")
-- sʃ -> ʃ ([[desxifrar]]), zʒ -> ʒ ([[disjuntor]])
if dialect ~= "val" then
text = rsub(text, "s(" .. separator_c .. "*ʃ)", "%1")
text = rsub(text, "z(" .. separator_c .. "*ʒ)", "%1")
end
------------------ Gemination of <bl>, <gl>
if dialect ~= "val" then
-- bl -> bbl, gl -> ggl after the stress when following a vowel; to avoid this, use <b_l> or <g_l>.
-- This must follow v > b above. To force a hard ungeminated [b] or [g], use <_b> or <_g>.
text = rsub(text, "(" .. stress_c .. ")(" .. sylsep_c .. ")([bg])l", "%1%3%2%3l")
else -- Valencian; undo manually written 'bbl', 'ggl' in words like [[poblar]], [[reglament]]
text = rsub(text, "([bg])(" .. sylsep_c .. ")%1l", "%2%1l")
end
------------------ Lenition of voiced stops
-- In Central Catalan, b/d/g become fricatives (actually approximants, like in Spanish) in the onset following a
-- vowel and (except for <d>) after <l> and <ll> (cf. GDLC [[cabellblanc]] [kəβɛ̀ʎβláŋ]). This also happens across
-- word boundaries but doesn't happen after stops, nor in Central Catalan after [r], [ɾ] or [z] (and hence probably
-- not after [ʒ] either, although I can't find any examples in GDLC).
--
-- In Valencian, <b> doesn't lenite (at least formally?), but <d> and <g> do lenite after [r], [ɾ] or [z].
--
-- Balearic is like Valencian in not leniting <b>, and probably like Central Catalan otherwise.
local lenite_bdg = {["b"] = "β", ["d"] = "ð", ["g"] = "ɣ"}
if dialect == "cen" then
text = rsub(text, "([" .. vowel_l .. "jwv]" .. separator_c .. "*[.#]" .. separator_c .. "*)([bdg])",
function(before, bdg) return before .. lenite_bdg[bdg] end)
text = rsub(text, "([" .. vowel_l .. "lʎ]" .. separator_c .. "*[.#]" .. separator_c .. "*)([bg])",
function(before, bg) return before .. lenite_bdg[bg] end)
elseif dialect == "val" then
text = rsub(text, "([" .. vowel_l .. "jwvrɾzʣ]" .. separator_c .. "*[.#]" .. separator_c .. "*)([dg])",
function(before, dg) return before .. lenite_bdg[dg] end)
text = rsub(text, "([" .. vowel_l .. "lʎ]" .. separator_c .. "*[.#]" .. separator_c .. "*)(g)",
function(before, g) return before .. lenite_bdg[g] end)
else
verify(dialect == "bal", ("Unrecognized dialect '%s'"):format(dialect))
text = rsub(text, "([" .. vowel_l .. "jwv]" .. separator_c .. "*[.#]" .. separator_c .. "*)([dg])",
function(before, dg) return before .. lenite_bdg[dg] end)
text = rsub(text, "([" .. vowel_l .. "lʎ]" .. separator_c .. "*[.#]" .. separator_c .. "*)(g)",
function(before, g) return before .. lenite_bdg[g] end)
end
------------------ Vowel reduction
-- Reduction of unstressed a,e in Central and Balearic (Eastern Catalan).
if dialect ~= "val" then
-- The following rules seem to apply, based on the old code:
-- (1) Stressed a and e are never reduced.
-- (2) Unstressed e directly following ə is not reduced.
-- (3) Unstressed e directly before written <a> or before /ɔ/ is not reduced.
-- (4) Written <ee> when both vowels precede the primary stress is reduced to [əə]. (This rule preempts #2.)
-- (5) Written <ee> when both vowels follow the primary stress isn't reduced at all.
-- Rule #2 in particular seems to require that we proceed left to right, which is how the old code was
-- implemented.
-- FIXME: These rules seem overly complex and may produce incorrect results in some circumstances.
local words = rsplit(text, " ")
for j, word in ipairs(words) do
local chars = split_into_chars(word)
-- See above where voicing assimilation is handled. The overall respelling begins and ends in #, which we
-- can ignore. We need to look ahead three chars in some circumstances, but in all those circumstances we
-- shouldn't run off the end (and have assertions to check this).
local seen_primary_stress = false
for i = 2, #chars - 1 do
local this = chars[i]
if chars[i] == AC then
seen_primary_stress = true
end
if (this ~= "a" and this ~= "e") or rfind(chars[i + 1], stress_c) then
-- Not a/e, or a stressed vowel; continue
else
local reduction = true
local prev, prev_stress, nxt, nxt_stress
if not rfind(chars[i - 1], sylsep_c) then
prev = ""
else
prev = chars[i - 2] -- this should be non-nil as chars[i - 1] is a syllable separator (not #)
verify(prev, "Missing # at word boundary")
prev_stress = ""
if rfind(prev, stress_c) then
prev_stress = prev
prev = chars[i - 3]
-- As above; chars[i - 2] is a stress indicator (not #).
verify(prev, "Missing # at word boundary")
end
end
if not rfind(chars[i + 1], sylsep_c) then
nxt = ""
-- no syllable separator follows: nxt is the empty string and nxt_stress stays nil
else
nxt = chars[i + 2]
nxt_stress = chars[i + 3]
-- chars[i + 1] is a syllable separator, so chars[i + 2] should not be a word boundary, so
-- chars[i + 3] should exist.
verify(nxt and nxt_stress, "Syllable separator at word boundary or missing # at word boundary")
end
if this == "e" and rfind(prev, "ə") then
reduction = false
elseif this == "e" and rfind(nxt, "[aɔ]") then
reduction = false
elseif this == "e" and nxt == "e" and not rfind(nxt_stress, AC) then
-- FIXME: Check specifically for AC duplicates previous logic but is probably wrong or unnecessary.
if not seen_primary_stress then
chars[i + 2] = "ə"
else
reduction = false
end
end
if reduction then
chars[i] = "ə"
end
end
end
words[j] = table.concat(chars)
end
text = table.concat(words, " ")
end
if dialect == "cen" then
-- Reduction of unstressed o (not before w)
text = rsub(text, "o([^" .. stress_l .. "w])", "u%1")
elseif dialect == "bal" then
-- Reduction of unstressed o per vowel harmony: unstressed /o/ -> /u/ directly before stressed /i/ or /u/;
-- as a Lua pattern, o can be followed only by consonants and/or syllable separators (no vowels, stress marks
-- or word separators).
text = rsub(text, "o([^" .. vowel_l .. stress_l .. wordsep_l .. "]*[iu]" .. stress_c .. ")", "u%1")
end
-- Final losses.
text = rsub(text, "j(ʧs?#)", "%1") -- boigs /bɔt͡ʃ/
text = rsub(text, "([ʃʧs])s#", "%1#") -- homophone plurals -xs, -igs, -çs
if dialect ~= "val" then
-- Remove j before palatal obstruents
text = rsub(text, "j(" .. sylsep_c .. "*[ʃʒʧʤ])", "%1")
else -- Valencian
-- Fortition of palatal fricatives
text = rsub(text, "ʒ", "ʤ")
text = rsub(text, "(i" .. stress_c .. "*" .. sylsep_c .. ")ʣ", "%1z")
end
if dialect ~= "cen" then
-- No palatal gemination ʎʎ > ll or ʎ, in Valencian and Balearic.
-- FIXME: These conditions seem to be targeting specific words and should probably be fixed using respelling
-- instead.
text = rsub(text, "([bpw]a" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = rsub(text, "([mv]e" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = rsub(text, "(ti" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = rsub(text, "(m[oɔ]" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = rsub(text, "(u" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l")
text = rsub(text, "ʎ(" .. sylsep_c .. "*ʎ)", "%1")
end
---------- Convert pseudo-symbols to real ones.
-- Convert g to IPA ɡ.
text = rsub(text, "g", "ɡ")
-- Convert pseudo-affricate symbols to full affricates.
local full_affricates = { ["ʦ"] = "t͡s", ["ʣ"] = "d͡z", ["ʧ"] = "t͡ʃ", ["ʤ"] = "d͡ʒ" }
text = rsub(text, "([ʦʣʧʤ])", full_affricates)
---------- Generate IPA stress marks.
-- Convert acute and grave to IPA stress marks.
text = rsub(text, AC, "ˈ")
text = rsub(text, GR, "ˌ")
-- Move IPA stress marks to the beginning of the syllable.
text = rsub_repeatedly(text, "([#.])([^#.]*)(" .. ipa_stress_c .. ")", "%1%3%2")
-- Suppress syllable divider before IPA stress indicator.
text = rsub(text, "%.(#?" .. ipa_stress_c .. ")", "%1")
-- Make all primary stresses but the last one in a given word be secondary. May be fed by the first rule above.
-- FIXME: Currently this is handled earlier, but we might want to move it here, as is done in [[Module:pt-pronunc]].
-- text = rsub_repeatedly(text, "ˈ([^ ]+)ˈ", "ˌ%1ˈ")
-- Make primary stresses in prefixes become secondary. (FIXME: Handled earlier now.)
-- text = rsub_repeatedly(text, "ˈ([^#]*#" .. PREFIX_MARKER .. ")", "ˌ%1")
-- Remove # symbols at word/text boundaries, as well as _ (which forces separate interpretation), pseudo-consonant
-- markers (at edges of some prefixes/suffixes), and prefix markers, and recompose.
text = rsub(text, "[#_" .. PSEUDOCONS .. "]", "")
text = mw.ustring.toNFC(text)
return text
end
-- Turn the orthographic syllables of one word into syllables whose onset, vowel and coda have been passed through the
-- letter-to-symbol replacement tables.
--
-- `syllables` is the syllable list of the word (a list of {onset, vowel, coda, stressed} objects that also carries the
-- fields `stress`, `is_prefix` and `is_suffix`). NOTE: the coda of its last syllable is overwritten in place when a
-- final -r/-rs is converted to the temporary "(r)" marker. `suffix_syllables` is a list of ready-made syllables that
-- is appended unchanged to the result. `dialect` selects the dialect-specific vowel table ("cen", "bal", anything else
-- uses the Valencian table). `pos` is accepted but not used here. `orig_word` is only quoted in error messages.
--
-- Returns a new syllable list (the per-syllable tables are copies; `stress`, `is_prefix` and `is_suffix` are carried
-- over). Throws an error if the stressed vowel is a bare e/o or if a final -r/-rs remains ambiguous.
local function preprocess_word(syllables, suffix_syllables, dialect, pos, orig_word)
	-- A stressed vowel written as plain <e> or <o> does not reveal its quality; demand an explicit accent.
	local stress_index = syllables.stress
	if stress_index then
		local nucleus = syllables[stress_index].vowel
		if rfind(nucleus, "[eo]") then
			local accents_for = {["e"] = {AC, GR, CFLEX, DIA}, ["o"] = {AC, GR, CFLEX}}
			local choices = {}
			for _, accent in ipairs(accents_for[nucleus]) do
				table.insert(choices, nucleus .. accent)
			end
			error(("In respelling '%s', the stressed vowel '%s' is ambiguous. Please mark it with an acute, " ..
				"grave, or combined accent: %s."):format(orig_word, nucleus,
				mw.text.listToText(choices, nil, " or ")))
		end
	end

	-- Word-final -r and -rs. In a word of more than one syllable whose last syllable is stressed, three shapes are
	-- silently converted to the temporary "(r)" marker:
	--   * coda <r> after one of a/à/i/í/é;
	--   * coda <rs> after é;
	--   * coda <r> or <rs> after ó when the onset contains one of s/t/d/ç.
	-- Any other coda that is, or ends in, a single <r> or <rs> is rejected below so that the editor has to choose
	-- between 'rr', '(rr)' and '(r)'.
	local last = syllables[#syllables]
	if #syllables > 1 and last.stressed then
		local use_paren_r = false
		if last.coda == "r" and rfind(last.vowel, "[aàiíé]") then
			use_paren_r = true
		elseif last.coda == "rs" and last.vowel == "é" then
			use_paren_r = true
		elseif last.vowel == "ó" and rfind(last.coda, "^rs?$") and rfind(last.onset, "[stdç]") then
			use_paren_r = true
		end
		if use_paren_r then
			last.coda = TEMP_PAREN_R
		end
	end
	if rfind(last.coda, "^rs?$") or rfind(last.coda, "[^r]rs?$") then
		error(("In respelling '%s', final -r by itself or in -rs is ambiguous except in the verbal endings -ar or " ..
			"-ir, in the nominal or adjectival endings -er(s) and -[dtsç]or(s). In all other cases it needs to be " ..
			"rewritten using one of 'rr' (pronounced everywhere), '(rr)' (pronounced everywhere but Balearic) or " ..
			"'(r)' (pronounced only in Valencian). Note that adjectives in -ar usually need rewriting using '(rr)'; " ..
			"nouns in -ar referring to places should be rewritten using '(r)'; and loanword nouns in -ir usually " ..
			"need rewriting using 'rr'."):format(orig_word))
	end

	-- The dialect-specific vowel table is the same for every syllable, so pick it once.
	local dialect_vowels =
		dialect == "cen" and IPA_vowels_central or
		dialect == "bal" and IPA_vowels_balearic or
		IPA_vowels_valencian

	local converted = {
		stress = syllables.stress,
		is_prefix = syllables.is_prefix,
		is_suffix = syllables.is_suffix,
	}
	for idx, source_syl in ipairs(syllables) do
		-- Work on a copy so that (apart from the final coda above) the caller's syllables stay untouched.
		local syl = {
			onset = source_syl.onset,
			vowel = source_syl.vowel,
			coda = source_syl.coda,
			stressed = source_syl.stressed,
		}
		-- An onset consisting of just <s> is voiced when the preceding syllable has an empty coda or a coda of
		-- only i or u. The check looks at the original (unconverted) coda of the preceding syllable.
		if syl.onset == "s" and idx > 1 and rfind(syllables[idx - 1].coda, "^[iu]?$") then
			syl.onset = "z"
		end
		-- Soft values of onset-final consonants before e/i vowels: tg -> ʤ, c -> s, g -> ʒ, qu -> k, gu -> g.
		-- The order matters: <tg> must be consumed before the bare <g> rule runs.
		if rfind(syl.vowel, "^[eèéêëií]$") then
			syl.onset = rsub(syl.onset, "tg$", "ʤ")
			syl.onset = rsub(syl.onset, "[cg]$", {["c"] = "s", ["g"] = "ʒ"})
			syl.onset = rsub(syl.onset, "[qg]u$", {["qu"] = "k", ["gu"] = "g"})
		end
		-- Coda-final <ig> or <igs> becomes iʤ.
		syl.coda = rsub(syl.coda, "igs?$", "iʤ")
		syl.onset = replace_context_free(syl.onset)
		syl.coda = replace_context_free(syl.coda)
		-- Vowels: first the dialect-specific table, then the table shared by all dialects.
		syl.vowel = rsub(syl.vowel, ".", dialect_vowels)
		syl.vowel = rsub(syl.vowel, ".", IPA_vowels)
		converted[idx] = syl
	end

	-- Tack on any suffix syllables supplied by the caller, unchanged.
	for _, suffix_syl in ipairs(suffix_syllables) do
		converted[#converted + 1] = suffix_syl
	end
	return converted
end
-- Work out the `from` half of a substitution spec when the user supplied only the replacement text `to`.
--
-- `to` is turned into a pattern that tolerates the usual differences between a respelling and the written form, and
-- that pattern is then matched against `pagename` (the actual page name or the value of the `pagename` param). If
-- `whole_word` is set (the spec was preceded by ~), the match is anchored with frontier patterns so it must cover a
-- whole word. Returns the matched substring of `pagename`. Throws an error if nothing matches, or if the matched text
-- is identical to `to` (the substitution would then be a no-op).
local function convert_single_substitution_to_original(to, pagename, whole_word)
	local pattern = to

	-- Rewrites that have to run on the raw text: once pattern_escape() has been called, the parentheses, dots and
	-- other magic characters involved would carry a % escape. Applied in order; each entry is {pattern, replacement}.
	local raw_rewrites = {
		{"%(rr%)", "r"}, -- '(rr)' is written <r>
		{"%(r%)", "r"}, -- '(r)' is written <r>
		{"ks", "x"}, -- respelled ks/Ks/gz correspond to written x/X
		{"Ks", "X"},
		{"gz", "x"},
		{"([bg])%1l", "%1l"}, -- respelled bbl/ggl correspond to written bl/gl
		{"[._]", ""}, -- '.' and '_' have no written counterpart
	}
	for _, rewrite in ipairs(raw_rewrites) do
		pattern = pattern:gsub(rewrite[1], rewrite[2])
	end

	pattern = require(strutil_module).pattern_escape(pattern)

	-- Rewrites on the escaped text, again in order.
	local escaped_rewrites = {
		{"rr", "rr?"}, -- respelled rr may be written r or rr
		{"ss", "ss?"}, -- respelled ss may be written s or ss
		{"ʃ", "[xX]"}, -- respelled ʃ matches written x
		-- pattern_escape() has already put a % in front of the hyphen, so the pattern below has to match a literal
		-- "%-" and the replacement has to produce a literal "%-?" (optional hyphen).
		{"%%%-", "%%-?"},
		-- A tie sign matches a space, a hyphen or nothing in the written form.
		{"‿", "[ %%-]?"},
	}
	for _, rewrite in ipairs(escaped_rewrites) do
		pattern = pattern:gsub(rewrite[1], rewrite[2])
	end

	-- An accented vowel in the respelling matches either itself or its plain counterpart.
	pattern = rsub(pattern, "[" .. written_accented_vowel_l .. "]", function(vowel)
		return "[" .. vowel .. written_accented_to_plain_vowel[vowel] .. "]"
	end)
	-- The dot-over and line-under marks are optional in the written form.
	pattern = pattern:gsub(DOTOVER, DOTOVER .. "?")
	pattern = pattern:gsub(LINEUNDER, LINEUNDER .. "?")

	pattern = "(" .. pattern .. ")"
	if whole_word then
		pattern = "%f[%a]" .. pattern .. "%f[%A]"
	end

	local found = rmatch(pagename, pattern)
	if not found then
		error(("Single substitution spec '%s' couldn't be matched to pagename '%s'"):format(to, pagename))
	end
	if found == to then
		error(("Single substitution spec '%s' found in pagename '%s', replacement would have no effect"):
			format(to, pagename))
	end
	return found
end
-- Apply a bracketed substitution spec such as '[ks]' or '[val:vol,ê,ks]' to `pagename` and return the resulting
-- respelling.
--
-- `respelling` is the spec including its surrounding brackets; its comma-separated items are either a mid vowel hint
-- (at most one), a FROM:TO pair, or a bare TO whose FROM is inferred from the pagename; an item may be prefixed with ~
-- to require a whole-word match. `pos` is the part of speech, consulted only when deciding whether a trailing -ment
-- is set aside before the mid vowel hint is applied. `allow_mid_vowel_hints`, if false, makes a mid vowel hint an
-- error. `parse_err` is the function used to report errors in the spec.
local function apply_substitution_spec(respelling, pagename, pos, allow_mid_vowel_hints, parse_err)
	local specs = split_on_comma(rmatch(respelling, "^%[(.*)%]$"))
	-- From here on, `respelling` is the text being built up, starting from the pagename.
	respelling = pagename

	-- Separate the (single) mid vowel hint from the ordinary replacement specs.
	local hint
	local replacements = {}
	for _, spec in ipairs(specs) do
		if not rfind(spec, "^" .. export.mid_vowel_hint_c .. "$") then
			table.insert(replacements, spec)
		else
			if hint then
				parse_err(("Specified mid vowel hint twice, '%s' and '%s'"):format(
					hint, spec))
			end
			hint = spec
		end
	end

	if hint then
		if not allow_mid_vowel_hints then
			parse_err(("Mid vowel hint '%s' not allowed when apply one substitution spec to multiple words"):format(
				hint))
		end
		-- Set a trailing -ment aside so the hint lands on the stressed vowel of the stem.
		-- FIXME: This duplicates logic in to_IPA().
		local ment_suffix = ""
		if not pos or pos == "adverb" then
			local stem, ment = rmatch(respelling, "^(.*)(m[eé]nt)$")
			if stem and (pos == "adverb" or not rfind(stem, "[iï]$") and
				rfind(stem, V .. ".*" .. V)) then
				ment_suffix = ment
				respelling = stem
			end
		end
		local syllables = split_syllables(respelling, "stress prefixes", "may be uppercase")
		local stressed_syl = syllables[syllables.stress]
		local stressed_vowel = stressed_syl.vowel
		-- Nothing to do if the stressed vowel already equals the hint; otherwise the hint must be an e-type hint on
		-- an e-type vowel or an o-type hint on an o-type vowel.
		if stressed_vowel ~= hint then
			if rfind(hint, "[èéêë]") and rfind(stressed_vowel, "[eEèÈ]") or
				rfind(hint, "[òóô]") and rfind(stressed_vowel, "[oO]") then
				stressed_syl.vowel = hint
			else
				parse_err(("Stressed vowel '%s' not compatible with mid vowel hint '%s'"):format(
					stressed_vowel, hint))
			end
		end
		respelling = reconstitute_word_from_syllables(syllables) .. ment_suffix
	end

	for _, spec in ipairs(replacements) do
		-- A leading ~ requests a whole-word match.
		local whole_word
		local without_tilde = rmatch(spec, "^~(.*)$")
		if without_tilde then
			spec = without_tilde
			whole_word = true
		end

		local from, to
		if spec:find(":") then
			from, to = rmatch(spec, "^(.-):(.*)$")
		else
			-- Only the replacement was given; infer what it replaces from the original pagename.
			to = spec
			from = convert_single_substitution_to_original(to, pagename, whole_word)
		end

		if from then
			local strutil = require(strutil_module)
			local from_pattern = strutil.pattern_escape(from)
			if whole_word then
				from_pattern = "%f[%a]" .. from_pattern .. "%f[%A]"
			end
			local to_replacement = strutil.replacement_escape(to)
			-- The replacement must hit exactly one place in the text built so far.
			local replaced, count = rsubn(respelling, from_pattern, to_replacement)
			if count == 0 then
				parse_err(("Substitution spec %s -> %s didn't match processed pagename '%s'"):format(
					from, to, respelling))
			elseif count > 1 then
				parse_err(("Substitution spec %s -> %s matched multiple substrings in processed pagename '%s', add " ..
					"more context"):format(from, to, respelling))
			else
				respelling = replaced
			end
		end
	end

	return respelling
end
-- Map from every accepted part-of-speech abbreviation (and each full name) to the canonical part-of-speech name.
local canonicalize_pos = {}
for canonical, aliases in pairs {
	noun = {"n", "noun"},
	verb = {"v", "vb", "verb"},
	adjective = {"a", "adj", "adjective"},
	adverb = {"av", "adv", "adverb"},
	other = {"o", "other"},
} do
	for _, alias in ipairs(aliases) do
		canonicalize_pos[alias] = canonical
	end
end

-- Split a leading 'POS/' prefix (lowercase ASCII letters followed by a slash) off `respelling`.
--
-- Returns two values: the canonical part of speech (nil if there is no prefix) and the remaining respelling. When
-- the prefix is followed by nothing, the remaining respelling is '+' (i.e. "use the pagename"). An unrecognized part
-- of speech is reported through `parse_err` along with the sorted list of accepted values.
local function parse_off_pos(respelling, parse_err)
	local prefix, remainder = respelling:match("^([a-z]+)/(.*)$")
	if not prefix then
		-- No part-of-speech prefix; hand the respelling back untouched.
		return nil, respelling
	end

	local canonical = canonicalize_pos[prefix]
	if not canonical then
		local accepted = {}
		for alias in pairs(canonicalize_pos) do
			accepted[#accepted + 1] = alias
		end
		-- pairs() order is unspecified, so sort to get a stable message.
		table.sort(accepted)
		parse_err(("Unrecognized part of speech '%s', should be one of %s"):format(prefix,
			table.concat(accepted, ", ")))
	end

	if remainder == "" then
		remainder = "+"
	end
	return canonical, remainder
end
-- Parse one user-supplied respelling. Besides a literal respelling, the input may be '+' (use the pagename), a mid
-- vowel hint standing in for a respelling, a substitution spec such as '[ks]' or '[val:vol,ê,ks]', raw IPA
-- ('/.../', '/.../ [...]' or 'raw:[...]'), '?' (unknown) or '-' (omitted).
--
-- The normal return value is {words = {WORD, WORD, ...}} with each WORD of the form
-- {term = PARSED_RESPELLING, pos = POS}. Special cases return instead: {unknown = true} for '?', {omitted = true} for
-- '-', and {raw_phonemic = ..., raw_phonetic = ...} (either may be nil) for raw IPA. `pagename` is the page name the
-- respelling is resolved against and `parse_err` is the function used to report errors.
local function parse_respelling(respelling, pagename, parse_err)
	if respelling == "?" then
		return {unknown = true}
	elseif respelling == "-" then
		return {omitted = true}
	end

	-- Raw IPA. A bare '[...]' only counts as raw phonetic IPA after an explicit 'raw:' prefix, since otherwise it is
	-- a substitution spec. The 'raw:' prefix, once seen, stays stripped for the rest of the function.
	local after_raw_prefix = respelling:match("^raw:(.*)$")
	local has_raw_prefix = after_raw_prefix ~= nil
	if has_raw_prefix then
		respelling = after_raw_prefix
	end
	local phonemic, phonetic = respelling:match("^/(.*)/ %[(.*)%]$")
	if not phonemic then
		phonemic = respelling:match("^/(.*)/$")
	end
	if not phonemic and has_raw_prefix then
		phonetic = respelling:match("^%[(.*)%]$")
	end
	if phonemic or phonetic then
		return {raw_phonemic = phonemic, raw_phonetic = phonetic}
	end

	pagename = decompose_respelling(pagename)
	respelling = decompose_respelling(respelling)
	local hint_c = export.mid_vowel_hint_c

	-- Canonicalize `text`, split it on spaces and wrap each word in a word object; with `parse_pos` set, a 'POS/'
	-- prefix is first split off each word.
	local function words_from_text(text, parse_pos)
		local objs = {}
		for _, word in ipairs(rsplit(canon_respelling(text), " ")) do
			local pos
			if parse_pos then
				pos, word = parse_off_pos(word, parse_err)
			end
			table.insert(objs, {term = word, pos = pos})
		end
		return {words = objs}
	end

	-- Resolve a single respelling word against the corresponding pagename word: '+' takes the pagename word, a lone
	-- mid vowel hint is treated as a one-item substitution spec, and a substitution spec is applied to the pagename
	-- word. Anything else is kept as written. A 'POS/' prefix is split off first.
	local function resolve_word(respelling_word, pagename_word)
		local pos
		pos, respelling_word = parse_off_pos(respelling_word, parse_err)
		if respelling_word == "+" then
			return {term = pagename_word, pos = pos}
		end
		if rfind(respelling_word, "^" .. hint_c .. "$") then
			respelling_word = "[" .. respelling_word .. "]"
		end
		if rfind(respelling_word, "^%[.*%]$") then
			respelling_word = apply_substitution_spec(respelling_word, pagename_word, pos,
				"allow mid vowel hint", parse_err)
		end
		return {term = respelling_word, pos = pos}
	end

	-- Single-word case: neither the pagename nor the respelling contains a space.
	if not pagename:find(" ") and not respelling:find(" ") then
		local only_word = resolve_word(respelling, pagename)
		only_word.term = canon_respelling(only_word.term)
		return {words = {only_word}}
	end

	-- Word-by-word syntax: line the respelling words up with the pagename words and resolve each pair.
	local function resolve_word_by_word()
		local spec_with_space = rmatch(respelling, "%[[^%[%]]* [^%[%]]*%]")
		if spec_with_space then
			parse_err(("When using word-by-word syntax with a multiword pagename, saw substitution spec '%s' " ..
				"with spaces, which is not allowed because it must match a single word"):format(spec_with_space))
		end
		pagename = canon_respelling(pagename)
		respelling = canon_respelling(respelling)
		local pagename_words = rsplit(pagename, " ")
		local respelling_words = rsplit(respelling, " ")
		if #pagename_words ~= #respelling_words then
			parse_err(("When using word-by-word syntax with a multiword pagename, saw %s words in pagename but " ..
				"%s word%s in respelling; they need to match"):format(#pagename_words, #respelling_words,
				#respelling_words > 1 and "s" or ""))
		end
		local objs = {}
		for idx = 1, #pagename_words do
			table.insert(objs, resolve_word(respelling_words[idx], pagename_words[idx]))
		end
		return {words = objs}
	end

	-- Multiword case. Three syntaxes are possible:
	--   * all-at-once: a '+' for the entire pagename, or one substitution spec applied to the entire pagename. No
	--     part-of-speech prefix is allowed since it would be unclear which word it belongs to.
	--   * word-by-word: a mix of respelled words, per-word substitution specs, mid vowel hints and '+'; the word
	--     count must equal that of the pagename so the items can be lined up.
	--   * replacement: a plain respelling (optionally with per-word part-of-speech prefixes); the pagename is
	--     disregarded and the number of words may differ.
	if respelling == "+" then
		return words_from_text(pagename)
	elseif rfind(respelling, "^%[.*%]$") then
		-- all-at-once syntax with substitution spec
		return words_from_text(apply_substitution_spec(respelling, pagename, nil, false, parse_err))
	elseif rfind(respelling, "^([a-z]+)/$") or rfind(respelling, "^([a-z]+)/%[[^%[%]]*%]$") then
		-- a part of speech was attached to all-at-once syntax
		parse_err(("Part of speech not allowed when pagename is multiword and all-at-once syntax is used in " ..
			"the respelling, but saw '%s'"):format(respelling))
	elseif rfind(respelling, "^" .. hint_c .. "$") then
		-- a lone mid-vowel hint was used as all-at-once syntax
		parse_err(("Single mid-vowel hint not allowed when pagename is multiword because it's not clear which " ..
			"word to apply it to, but saw '%s'"):format(respelling))
	elseif rfind(respelling, "[+%[%]]") or rfind(respelling, "^" .. hint_c .. " ") or
		rfind(respelling, " " .. hint_c .. " ") or
		rfind(respelling, " " .. hint_c .. "$") then
		return resolve_word_by_word()
	else
		-- replacement syntax; pagename ignored
		return words_from_text(respelling, "parse pos")
	end
end
-- Parse a list of comma-split runs, each holding one respelling plus optional inline modifiers. The input is what you
-- get from parse_balanced_segment_run() (or similar) followed by split_alternating_runs() (or similar) in
-- [[Module:parse utilities]].
--
-- `pagename` is the page name used to resolve '+', mid-vowel hints such as 'ê' and substitution specs such as '[ks]';
-- if it is the boolean `true`, each respelling is used as its own pagename. `original_input` (the raw input) and
-- `input_param` (the name of the parameter it came from) are passed on to the inline-modifier parser.
--
-- Returns an object with a single field `terms` (an object rather than a bare list so that further outer properties
-- can be added later). Each element of `terms` is a term object as produced by parse_respelling(): either `words`,
-- or `raw_phonemic` and/or `raw_phonetic`, or `unknown`, or `omitted`. Inline modifiers add the list-valued fields
-- `q`, `qq`, `a`, `aa` and/or `ref`; the values under `ref` come from parse_references() in [[Module:references]],
-- the others are strings.
function export.parse_comma_separated_groups(comma_separated_groups, pagename, original_input, input_param)
	local parse_utils = require(parse_utilities_module)
	local result = {terms = {}}

	-- Callback handed to the inline-modifier parser: turn the bare respelling text into a term object.
	local function respelling_to_obj(respelling, parse_err)
		local effective_pagename = pagename
		if pagename == true and respelling then
			effective_pagename = respelling
		end
		return parse_respelling(respelling, effective_pagename, parse_err)
	end

	-- Build a fresh modifier table for each group.
	local function new_param_mods()
		return {
			-- pre = { overall = true },
			-- post = { overall = true },
			ref = { store = "insert", convert = function(arg, parse_err)
				return require("Module:references").parse_references(arg)
			end },
			q = { store = "insert" },
			qq = { store = "insert" },
			a = { store = "insert" },
			aa = { store = "insert" },
		}
	end

	for _, group in ipairs(comma_separated_groups) do
		-- Entries at even positions are the delimited runs. One that is not a <...> modifier is part of the
		-- respelling text, so glue it and the text run after it back onto the text run before it (in place).
		local idx = 2
		while idx <= #group do
			if group[idx]:find("^<.*>$") then
				idx = idx + 2
			else
				group[idx - 1] = group[idx - 1] .. group[idx] .. group[idx + 1]
				table.remove(group, idx)
				table.remove(group, idx)
			end
		end

		local term = parse_utils.parse_inline_modifiers_from_segments {
			group = group,
			arg = original_input,
			props = {
				paramname = input_param,
				param_mods = new_param_mods(),
				generate_obj = respelling_to_obj,
				splitchar = ",",
				outer_container = result,
			},
		}
		table.insert(result.terms, term)
	end

	return result
end
-- Produce the pronunciation of `words` in `dialect` ("cen", "bal" or "val"). `words` is a list of word objects of
-- the form {term = RESPELLING, pos = PART_OF_SPEECH}. The words are converted one at a time to syllabified
-- pseudo-IPA, joined with boundary markers, and the result is handed to postprocess_general(), whose return value is
-- returned.
local function to_IPA(words, dialect)
	-- Reject the "wrong way round" accents up front, before any other processing.
	for _, wordobj in ipairs(words) do
		if rfind(wordobj.term, "[áìùÁÌÙ]") then
			error(("Invalid accented character in respelling '%s'; use accented à í ú, not the reversed versions"
				):format(wordobj.term))
		end
	end

	words = handle_unstressed_words(words)

	local word_pronuns = {}
	for _, wordobj in ipairs(words) do
		local orig_word = wordobj.term
		local pos = wordobj.pos
		local word = ulower(orig_word)

		-- Split off a final -ment when the part of speech is "adverb", or when no part of speech was given and
		-- the stem does not end in i/ï and contains at least two vowels. The stem is then processed as an
		-- adjective and -ment is re-attached as a ready-made stressed syllable.
		local suffix_syllables = {}
		local split_ment = false
		if not pos or pos == "adverb" then
			local stem, ment = rmatch(word, "^(.*)(m[eé]nt)$")
			if stem and (pos == "adverb" or not rfind(stem, "[iï]$") and
				rfind(stem, V .. ".*" .. V)) then
				suffix_syllables = {{onset = "m", vowel = "e", coda = "nt", stressed = true}}
				split_ment = true
				pos = "adjective"
				word = stem
			end
		end

		word = word_fixes(word, dialect)
		local syllables = preprocess_word(split_syllables(word), suffix_syllables, dialect, pos, orig_word)

		-- Reassemble the word, inserting a stress mark after the vowel of each stressed syllable.
		local pieces = {}
		for idx, syl in ipairs(syllables) do
			local stress_mark
			if idx == syllables.stress and not syllables.is_prefix and not split_ment then
				-- primary stress on the word's own stressed syllable (never for prefixes)
				stress_mark = AC
			elseif split_ment and idx == #syllables then
				-- with a split-off -ment, primary stress goes on the -ment syllable instead
				stress_mark = AC
			elseif syl.stressed then
				-- any other stressed syllable carries secondary stress
				stress_mark = GR
			else
				stress_mark = ""
			end
			table.insert(pieces, syl.onset .. syl.vowel .. stress_mark .. syl.coda)
		end
		table.insert(word_pronuns, table.concat(pieces, "."))
	end

	-- Mark boundaries: ## at the start and end of the whole string and around a foot boundary (|), and a single #
	-- on each side of every other space. For example 'foo bar baz | bat quux' ends up as
	-- '##foo# #bar# #baz## #|# ##bat# #quux##'.
	local text = "##" .. table.concat(word_pronuns, " ") .. "##"
	text = rsub(text, " | ", "# | #")
	text = rsub(text, " ", "# #")
	return postprocess_general(text, dialect)
end
-- Fill in the pronunciations for every respelling in `parsed_respellings`. `parsed_respellings` is a table keyed by
-- dialect identifier (e.g. "cen" for Central Catalan, "val" for Valencian) whose values have the format returned by
-- parse_comma_separated_groups(). The table is modified in place: each term object gains `phonemic` and/or `phonetic`
-- and loses the input fields those were computed from. (FIXME: for respellings, only the phonetic pronunciation is
-- currently generated.) Term objects marked `unknown` or `omitted` are left untouched. Nothing is returned.
function export.generate_phonemic_phonetic(parsed_respellings)
	for dialect, respelling_spec in pairs(parsed_respellings) do
		for _, term in ipairs(respelling_spec.terms) do
			-- `unknown` and `omitted` terms are dealt with later by other functions.
			if not (term.unknown or term.omitted) then
				if term.raw_phonemic or term.raw_phonetic then
					-- Raw IPA supplied by the user: pass it through unchanged.
					term.phonemic = term.raw_phonemic
					term.phonetic = term.raw_phonetic
					-- Drop the inputs so that by-value comparison of term objects looks only at the resulting
					-- pronunciations and the qualifiers.
					term.raw_phonemic = nil
					term.raw_phonetic = nil
				else
					term.phonetic = to_IPA(term.words, dialect)
					-- Same reason as above.
					term.words = nil
				end
			end
		end
	end
end
-- Merge dialects whose pronunciations are identical in every respect (the pronunciations themselves as well as all
-- qualifiers and other inline modifiers). `parsed_respellings` is the table processed by
-- generate_phonemic_phonetic(). Returns a list of groups; each group has the fields `dialects` (the dialects sharing
-- the pronunciations) and `pronuns` (the list of pronunciation objects, each with `phonemic` and/or `phonetic` plus
-- any of the inline modifier fields `q`, `qq`, `a`, `aa`, `ref`). A dialect in which any term is marked `omitted` is
-- left out entirely.
function export.group_pronuns_by_dialect(parsed_respellings)
	local groups = {}

	-- True if any term in the list is marked as omitted.
	local function has_omitted_term(terms)
		for _, term in ipairs(terms) do
			if term.omitted then
				return true
			end
		end
		return false
	end

	-- Return the first existing group whose pronunciations are deep-equal to `terms`, or nil if there is none.
	local function find_group(terms)
		for _, group in ipairs(groups) do
			if m_table.deepEquals(group.pronuns, terms) then
				return group
			end
		end
		return nil
	end

	for dialect, pronun_spec in pairs(parsed_respellings) do
		local terms = pronun_spec.terms
		if not has_omitted_term(terms) then
			local existing = find_group(terms)
			if existing then
				table.insert(existing.dialects, dialect)
			else
				table.insert(groups, {dialects = {dialect}, pronuns = terms})
			end
		end
	end

	return groups
end
-- Format pronunciations grouped by dialect. `grouped_pronuns` contains the output of group_pronuns_by_dialect().
-- This destructively modifies `grouped_pronuns`, adding a field 'formatted' to the first-level values of
-- `grouped_pronuns` containing the formatted pronunciation(s) for a given set of dialects. Nothing is returned.
--
-- Each pronunciation contributes one or two items to the list passed to format_IPA_full() in [[Module:IPA]]: the
-- phonemic form in slashes and/or the phonetic form in brackets (or a placeholder item for an unknown
-- pronunciation). Left-hand modifiers (`q`, `a`) are attached to the first of those items and right-hand modifiers
-- (`qq`, `aa`, references) to the last.
function export.format_grouped_pronunciations(grouped_pronuns)
	-- Return the flat list of references attached to `pronun`, or nil if there are none. The inline modifier is
	-- declared as `ref` in parse_comma_separated_groups(), so the parsed references are stored under `ref`;
	-- previously only a field `refs` was consulted here, so the references were silently dropped. A field `refs`,
	-- if present, is still honored as-is. Each entry under `ref` is assumed to be either a list of references (one
	-- list per <ref:...> modifier) or a single reference object; lists are flattened one level.
	local function collect_refs(pronun)
		if pronun.refs then
			return pronun.refs
		end
		if not pronun.ref then
			return nil
		end
		local refs = {}
		for _, entry in ipairs(pronun.ref) do
			if type(entry) == "table" and entry[1] ~= nil then
				for _, ref in ipairs(entry) do
					table.insert(refs, ref)
				end
			else
				table.insert(refs, entry)
			end
		end
		if #refs == 0 then
			return nil
		end
		return refs
	end

	for _, grouped_pronun_spec in pairs(grouped_pronuns) do
		local pronunciations = {}
		-- Loop through each pronunciation. For each one, add the phonemic and phonetic versions to `pronunciations`,
		-- for formatting by [[Module:IPA]] or raw (for use in [[Module:ca-headword]]).
		for j, pronun in ipairs(grouped_pronun_spec.pronuns) do
			-- Add dialect tags to the left accent qualifiers of the first pronunciation. The list is copied first
			-- so that the parsed term object is not modified.
			local as = pronun.a
			if j == 1 then
				if as then
					as = m_table.deepCopy(as)
				else
					as = {}
				end
				for _, dialect in ipairs(grouped_pronun_spec.dialects) do
					table.insert(as, export.dialects_to_names[dialect])
				end
			end

			-- Remember where this pronunciation's items start so the modifiers can be attached afterwards.
			local first_pronun = #pronunciations + 1
			if pronun.unknown then
				-- FIXME: This is a massive hack but it works for now.
				table.insert(pronunciations, { pron = "", pretext = "''unknown''" })
			else
				if not pronun.phonemic and not pronun.phonetic then
					error("Internal error: Saw neither phonemic nor phonetic pronunciation")
				end
				if pronun.phonemic then -- missing if 'raw:[...]' given
					local slash_pron = "/" .. pronun.phonemic .. "/"
					table.insert(pronunciations, {
						pron = slash_pron,
					})
				end
				if pronun.phonetic then -- missing if '/.../' given
					local bracket_pron = "[" .. pronun.phonetic .. "]"
					table.insert(pronunciations, {
						pron = bracket_pron,
					})
				end
			end
			local last_pronun = #pronunciations

			-- Left-hand modifiers go on the first item of this pronunciation.
			if pronun.q then
				pronunciations[first_pronun].q = pronun.q
			end
			if as then
				pronunciations[first_pronun].a = as
			end
			-- Separate this pronunciation from the preceding one with a comma.
			if j > 1 then
				pronunciations[first_pronun].separator = ", "
			end

			-- Right-hand modifiers and references go on the last item of this pronunciation.
			if pronun.qq then
				pronunciations[last_pronun].qq = pronun.qq
			end
			if pronun.aa then
				pronunciations[last_pronun].aa = pronun.aa
			end
			local refs = collect_refs(pronun)
			if refs then
				pronunciations[last_pronun].refs = refs
			end
			-- When both a phonemic and a phonetic item were added, separate the two with just a space.
			if first_pronun ~= last_pronun then
				pronunciations[last_pronun].separator = " "
			end
		end

		grouped_pronun_spec.formatted = m_IPA.format_IPA_full {
			lang = lang,
			items = pronunciations,
			separator = "",
		}
	end
end
-- Template entry point. Reads the parent frame's arguments, works out which respelling applies to each dialect, parses
-- the respellings, converts them to IPA, groups dialects with identical results and returns the formatted groups joined
-- by newlines.
--
-- Recognized parameters: 1= (applies to every dialect), one parameter per dialect group in `export.dialect_groups`, one
-- parameter per dialect in `export.dialects`, indent= and pagename=. Later sources override earlier ones in the order
-- 1=, dialect group, individual dialect.
function export.show(frame)
	local params = {
		[1] = {},
		indent = {},
		pagename = {}, -- for testing or documentation pages
	}
	for _, dialect in ipairs(export.dialects) do
		params[dialect] = {}
	end
	for dialect_group in pairs(export.dialect_groups) do
		params[dialect_group] = {}
	end

	local args = require("Module:parameters").process(frame:getParent().args, params)
	local pagename = args.pagename or mw.title.getCurrentTitle().subpageText

	-- Map from dialect to {input = respelling spec, param = name of the parameter it came from}.
	local inputs = {}
	local function set_input(dialect, value, param)
		inputs[dialect] = {input = value, param = param}
	end

	-- Least specific first: 1= covers all dialects.
	local all_dialects_value = args[1]
	if all_dialects_value then
		for _, dialect in ipairs(export.dialects) do
			set_input(dialect, all_dialects_value, 1)
		end
	end
	-- A dialect-group parameter overrides 1= for the dialects in that group.
	for dialect_group, group_dialects in pairs(export.dialect_groups) do
		local group_value = args[dialect_group]
		if group_value then
			for _, dialect in ipairs(group_dialects) do
				set_input(dialect, group_value, dialect_group)
			end
		end
	end
	-- An individual dialect parameter overrides everything else.
	for _, dialect in ipairs(export.dialects) do
		local dialect_value = args[dialect]
		if dialect_value then
			set_input(dialect, dialect_value, dialect)
		end
	end
	-- Nothing specified at all: every dialect gets the input "+" (i.e. base it on the current pagename).
	if next(inputs) == nil then
		for _, dialect in ipairs(export.dialects) do
			set_input(dialect, "+", "(pagename)")
		end
	end

	-- Parse the respelling spec of each dialect.
	local parsed_respellings = {}
	for dialect, inputspec in pairs(inputs) do
		local spec = inputspec.input
		if spec:find("[<%[]") then
			-- The spec contains [...] (substitution notation) and/or <...> (inline modifiers). Commas inside either
			-- kind of bracket must not act as respelling delimiters, so split into balanced segment runs first and
			-- only then split on comma. For example, given
			--
			--   "a[x]a<q:learned>,[vol:vôl,ks]<q:nonstandard>"
			--
			-- parse_multi_delimiter_balanced_segment_run() produces
			--
			--   {"a", "[x]", "a", "<q:learned>", ",", "[vol:vôl,ks]", "", "<q:nonstandard>", ""}
			--
			-- and split_alternating_runs_on_comma() turns that into
			--
			--   {{"a", "[x]", "a", "<q:learned>", ""}, {"", "[vol:vôl,ks]", "", "<q:nonstandard>", ""}}
			--
			-- The text on either side of each square-bracketed portion still has to be rejoined with it; the groups
			-- are handed to parse_comma_separated_groups() for processing.
			local put = require(parse_utilities_module)
			local segments = put.parse_multi_delimiter_balanced_segment_run(spec, {{"<", ">"}, {"[", "]"}})
			local comma_separated_groups = put.split_alternating_runs_on_comma(segments)
			parsed_respellings[dialect] = export.parse_comma_separated_groups(comma_separated_groups, pagename,
				spec, inputspec.param)
		else
			-- Plain comma-separated respellings with no brackets. Errors are reported together with the parameter
			-- name and its full value.
			local function parse_err(msg)
				error(msg .. ": " .. inputspec.param .. "=" .. spec)
			end
			local termobjs = {}
			for _, term in ipairs(split_on_comma(spec)) do
				table.insert(termobjs, parse_respelling(term, pagename, parse_err))
			end
			parsed_respellings[dialect] = {terms = termobjs}
		end
	end

	-- Respellings -> phonemic/phonetic IPA (in place), then group identical dialects, then format each group.
	export.generate_phonemic_phonetic(parsed_respellings)
	local grouped_pronuns = export.group_pronuns_by_dialect(parsed_respellings)
	export.format_grouped_pronunciations(grouped_pronuns)

	-- Join the formatted groups, one per line.
	local lines = {}
	for _, group in ipairs(grouped_pronuns) do
		lines[#lines + 1] = group.formatted
	end
	local bullet = (args.indent or "*") .. " "
	local out = table.concat(lines, "\n" .. bullet)
	-- The first line is only prefixed when indent= was given explicitly; continuation lines are always prefixed.
	if args.indent then
		return bullet .. out
	end
	return out
end
-- Helper for [[Module:ca-IPA/testcases]]: parse `respelling` for page `pagename` and return whatever to_IPA() produces
-- for the parsed words in the given `dialect`. Parse errors are raised with the bare message.
function export.test(pagename, respelling, dialect)
	-- Keep a wrapper rather than passing `error` itself, so the error is raised from this function.
	local raise_parse_error = function(msg)
		error(msg)
	end
	local termobj = parse_respelling(respelling, pagename, raise_parse_error)
	return to_IPA(termobj.words, dialect)
end
return export