local export = {}

local m_grc_utils = require("Module:grc-utilities")
local m_grc_utils_data = require("Module:grc-utilities/data")
local m_str_utils = require("Module:string utilities")

-- Shortcuts to library functions used below.
local canonicalize = m_grc_utils.canonicalize
local tokenize = m_grc_utils.tokenize
local concat = table.concat
local insert = table.insert
local split = m_str_utils.split
local u = m_str_utils.char
local ugsub = m_str_utils.gsub
local ulower = m_str_utils.lower
local umatch = mw.ustring.match
local uupper = m_str_utils.upper

-- Diacritic data shared with the other grc modules.
local diacritic = m_grc_utils_data.diacritic
local diacritics = m_grc_utils_data.diacritics

-- Diacritics as they appear in the Greek input.
local circumflex = diacritics.circum
local smooth = diacritics.smooth
local rough = diacritics.rough
local breve = diacritics.breve
local macron = diacritics.macron
local subscript = diacritics.subscript
local vowel = m_grc_utils_data.vowel

-- Circumflex as written in the Latin output.
local hat = diacritics.Latin_circum

-- Patterns and constants.
local a_subscript = "^α.*" .. subscript .. "$"
local question_mark = u(0x37E)
local velar = "[γκξχϙ]"
-- One UTF-8 encoded character: a lead byte plus any continuation bytes.
local utf8_char = ".[\128-\191]*"
local rho_rough = "ρ" .. rough
local rho_smooth = "ρ" .. smooth

-- Vowels that receive a macron in the output unless marked with a breve.
local long_vowels = {
	["η"] = "e",
	["ω"] = "o",
}

-- Greek letter / diacritic -> Latin replacement.
local letter_map = {
	-- Vowels
	["α"] = "a",
	["ε"] = "e",
	["ι"] = "i",
	["ο"] = "o",
	["υ"] = "u",

	-- Consonants
	["β"] = "b",
	["γ"] = "g",
	["δ"] = "d",
	["ζ"] = "z",
	["θ"] = "th",
	["κ"] = "k",
	["λ"] = "l",
	["μ"] = "m",
	["ν"] = "n",
	["ξ"] = "x",
	["π"] = "p",
	["ρ"] = "r",
	["σ"] = "s",
	["ς"] = "s",
	["τ"] = "t",
	["φ"] = "ph",
	["χ"] = "kh",
	["ψ"] = "ps",

	-- Other letters
	["ϛ"] = "st",
	["ϝ"] = "w",
	["ͱ"] = "h",
	["ϳ"] = "j",
	["ϙ"] = "q",
	["ϻ"] = "s",
	["ϸ"] = "š",
	["ͳ"] = "s",
	--["ͷ"] = "v", Differs by dialect.

	-- Diacritics. Macron, diaeresis, grave and acute have no entry and are
	-- therefore passed through unchanged.
	[smooth] = "",
	[rough] = "",
	[circumflex] = hat,
	[subscript] = "i",
}

-- Finds the first token after index `i` that contains none of ( ) [ ] { }.
-- Returns four values: the index of that token, the token itself (nil when
-- the list is exhausted), its lowercase form (nil likewise), and the
-- concatenation of every token that was skipped on the way.
local function get_next_token(tokens, i)
	local first = i + 1
	local pos = i
	local token
	repeat
		pos = pos + 1
		token = tokens[pos]
	until not token or not token:match("[()[%]{}]")
	return pos, token, token and ulower(token), concat(tokens, nil, first, pos - 1)
end

-- Replacement callback for one letter plus the non-alphanumeric characters
-- that trail it. `letter` is looked up in `long_vowels` first, then in
-- `letter_map`; unknown letters are kept as they are. Each character of
-- `trail` is mapped through `letter_map` as well.
local function translit_letter(letter, trail)
	local base = long_vowels[letter]
	if base then
		-- Long vowels get a macron unless the input carries a breve.
		if not trail:find(breve) then
			base = base .. macron
		end
	else
		base = letter_map[letter] or letter
	end
	local marks = trail:gsub(utf8_char, letter_map)
	return base .. marks
end

-- Transliterates a single (lowercase) token letter by letter.
local function transliterate_token(token)
	return ugsub(token, "(.)(%W*)", translit_letter)
end

-- Given a run of non-alphanumeric characters, drops every macron from it
-- when the run also contains the Latin circumflex; otherwise returns the run
-- unchanged.
local function strip_macron_under_hat(run)
	if not run:find(hat) then
		return run
	end
	local stripped = run:gsub(macron, "")
	return stripped
end

-- Adds the "h" for a rough breathing: in front of the transliteration when
-- the token matches the vowel pattern, otherwise after it unless its last
-- alphanumeric character is already "h".
local function add_aspiration(translit, lower_token)
	if umatch(lower_token, vowel) then
		return "h" .. translit
	end
	local final = umatch(translit, "(%w)%W*$")
	if final and final ~= "h" then
		return translit .. "h"
	end
	return translit
end

-- Cleans up `translit`, restores the capitalisation of the source token and
-- appends the result, followed by `suffix`, to `output`.
local function insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
	-- Remove any duplicate diacritics (this shouldn't really happen).
	local duplicate = "(" .. diacritic .. ")(%W-)%1"
	local count = 1
	while count ~= 0 do
		translit, count = ugsub(translit, duplicate, "%1%2")
	end

	-- A vowel bearing a circumflex must not keep its macron.
	translit = ugsub(translit, "%W+", strip_macron_under_hat)

	local cased
	if token == lower_token then
		-- Source token was not capitalised.
		cased = translit
	elseif next_token == next_token_lower then
		-- Capitalised token followed by a non-capitalised one (or by
		-- nothing): uppercase the first character only.
		cased = translit:gsub("^" .. utf8_char, uupper)
	else
		-- The following token is capitalised too: uppercase everything.
		cased = uupper(translit)
	end

	insert(output, cased .. suffix)
end

-- Transliterates Ancient Greek `text` into Latin script.
--   text: the string to transliterate.
--   lang: language code; "xbc" switches the output for φ from "ph" to "f".
--   sc:   accepted for interface compatibility; not used here.
-- Returns the transliteration as a string.
function export.tr(text, lang, sc)
	-- A lone rough-breathing sign.
	if text == "῾" then
		return "h"
	end

	-- Turn the semicolon and the Greek question mark into a regular question
	-- mark, but leave HTML entities alone: splitting on the entity pattern
	-- with a capture puts plain text at odd indices and entities at even ones.
	local chunks = split(canonicalize(text), "(&#?%w+;)")
	for idx = 1, #chunks, 2 do
		local chunk = chunks[idx]:gsub(";", "?")
		chunks[idx] = chunk:gsub(question_mark, "?")
	end

	-- The middle dot is equivalent to a semicolon or a colon; the semicolon
	-- is probably the more common reading.
	local normalized = concat(chunks):gsub("·", ";")

	local tokens = tokenize(normalized)

	-- One token of look-ahead is kept at all times, together with whatever
	-- bracket tokens were skipped to reach it.
	local ahead_i, ahead, ahead_lower, skipped = get_next_token(tokens, 0)
	local output = {skipped}

	while ahead do
		local i, token, lower_token = ahead_i, ahead, ahead_lower
		local after_rho = false
		local translit = transliterate_token(lower_token)
		ahead_i, ahead, ahead_lower, skipped = get_next_token(tokens, i)

		if lower_token:find("γ") and ahead_lower and umatch(ahead_lower, velar) then
			-- γ before a velar is written <n>.
			translit = translit:gsub("g", "n")
		elseif lang == "xbc" and lower_token:find("φ") then
			translit = translit:gsub("ph", "f")
		elseif token == rho_rough then
			translit = "rh"
		elseif token == rho_smooth then
			translit = "r"
		elseif lower_token:find("ρ") then
			-- A run of ρ: emit each one while another ρ follows, and flag the
			-- last one of the run so that it receives an "h" below.
			while ahead_lower and ahead_lower:find("ρ") do
				insert_translit(output, translit, token, lower_token, ahead, ahead_lower, skipped)
				i, token, lower_token = ahead_i, ahead, ahead_lower
				after_rho = true
				translit = transliterate_token(lower_token)
				ahead_i, ahead, ahead_lower, skipped = get_next_token(tokens, i)
			end
		elseif umatch(lower_token, a_subscript) then
			-- ᾳ gets a macron.
			translit = translit:gsub("a", "a" .. macron)
		end

		if after_rho or lower_token:find(rough) then
			translit = add_aspiration(translit, lower_token)
		end

		insert_translit(output, translit, token, lower_token, ahead, ahead_lower, skipped)
	end

	return concat(output)
end

return export
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4