-- turkce-sayi.lua
-- Turkish suffix harmony for LaTeX-generated numbers.
-- Requires LuaTeX 1.10+ (Lua 5.3+).
-- License: LPPL 1.3c  https://www.latex-project.org/lppl.txt
-- Maintainer: Sonat Suer <sonatsuer@gmail.com>

-- Module table: the public functions are attached to M further down and
-- the table itself is returned at the end of the file.
local M = {}

------------------------------------------------------------------------
-- Phonological properties of the last spoken component of each number.
-- f: final vowel is front (e/i/ö/ü = true; a/ı/o/u = false)
-- r: final vowel is rounded (ö/ü/o/u = true)
-- v: word ends in a vowel (triggers buffer/drop rule)
-- d: last phoneme is voiced (vowel or voiced consonant)
------------------------------------------------------------------------
-- Keyed by the numeric value of a single Turkish number word.  Each row is
-- built positionally as row(f, r, v, d); the helpers stay private to the
-- do-block so they do not leak into the rest of the file.
local WORD_PROPS
do
  local T, F = true, false

  -- f = front vowel, r = rounded vowel, v = ends in vowel, d = voiced ending
  local function row(f, r, v, d)
    return { f = f, r = r, v = v, d = d }
  end

  WORD_PROPS = {
    -- units
    [0]  = row(F, F, F, T),   -- sıfır
    [1]  = row(T, F, F, T),   -- bir
    [2]  = row(T, F, T, T),   -- iki
    [3]  = row(T, T, F, F),   -- üç
    [4]  = row(T, T, F, F),   -- dört
    [5]  = row(T, F, F, F),   -- beş
    [6]  = row(F, F, T, T),   -- altı
    [7]  = row(T, F, T, T),   -- yedi
    [8]  = row(T, F, F, T),   -- sekiz
    [9]  = row(F, T, F, T),   -- dokuz
    -- tens
    [10] = row(F, T, F, T),   -- on
    [20] = row(T, F, T, T),   -- yirmi
    [30] = row(F, T, F, T),   -- otuz
    [40] = row(F, F, F, F),   -- kırk
    [50] = row(T, F, T, T),   -- elli
    [60] = row(F, F, F, F),   -- altmış
    [70] = row(T, F, F, F),   -- yetmiş
    [80] = row(T, F, F, T),   -- seksen
    [90] = row(F, F, F, T),   -- doksan
    -- larger magnitudes
    [100]           = row(T, T, F, T),   -- yüz
    [1000]          = row(T, F, F, T),   -- bin
    [1000000]       = row(F, T, F, T),   -- milyon
    [1000000000]    = row(F, F, F, T),   -- milyar
    [1000000000000] = row(F, T, F, T),   -- trilyon
  }
end

-- Values of the multi-digit number words, largest first.  last_word()
-- walks this list in order and peels each magnitude off the number.
local MAGNITUDES = {
  1000000000000,  -- trilyon
  1000000000,     -- milyar
  1000000,        -- milyon
  1000,           -- bin
  100,            -- yüz
  90,             -- doksan
  80,             -- seksen
  70,             -- yetmiş
  60,             -- altmış
  50,             -- elli
  40,             -- kırk
  30,             -- otuz
  20,             -- yirmi
  10,             -- on
}

-- Returns the WORD_PROPS row for the last word of n when it is read out.
-- n is taken by absolute value and rounded to the nearest integer first.
-- Each magnitude that fits into the remaining value becomes the current
-- candidate and is reduced away; a non-zero remainder at the end is a
-- units word and takes precedence.
local function last_word(n)
  local rest = math.floor(math.abs(n) + 0.5)
  if rest == 0 then
    return WORD_PROPS[0]
  end

  local candidate = WORD_PROPS[0]
  for i = 1, #MAGNITUDES do
    local mag = MAGNITUDES[i]
    if rest >= mag then            -- at least one whole `mag` is present
      candidate = WORD_PROPS[mag]
      rest = rest % mag
    end
  end

  if rest > 0 then
    return WORD_PROPS[rest]        -- trailing units word (1..9)
  end
  return candidate
end

------------------------------------------------------------------------
-- Character classification (Unicode codepoints)
------------------------------------------------------------------------
-- Lowercase Turkish specials: ı=0x0131  ü=0x00FC  ö=0x00F6  ç=0x00E7  ş=0x015F  ğ=0x011F
-- Uppercase Turkish specials: İ=0x0130  Ü=0x00DC  Ö=0x00D6  Ç=0x00C7  Ş=0x015E  Ğ=0x011E
-- Turkish case rule: i(0x69) ↔ İ(0x0130),  ı(0x0131) ↔ I(0x0049)

-- Capital → lowercase codepoint map using Turkish case pairing.
-- Q, W and X deliberately get no entry.
local UPPER_TO_LOWER = {}
do
  local skipped = { [0x51] = true, [0x57] = true, [0x58] = true }  -- Q W X

  -- ASCII capitals sit exactly 0x20 below their lowercase forms.
  for cp = 0x41, 0x5A do
    if not skipped[cp] then
      UPPER_TO_LOWER[cp] = cp + 0x20
    end
  end

  -- Turkish dotted/dotless pairing replaces the ASCII rule for I.
  UPPER_TO_LOWER[0x49]   = 0x0131  -- I → ı
  UPPER_TO_LOWER[0x0130] = 0x69    -- İ → i

  -- Remaining non-ASCII capitals.
  UPPER_TO_LOWER[0x00C7] = 0x00E7  -- Ç → ç
  UPPER_TO_LOWER[0x00D6] = 0x00F6  -- Ö → ö
  UPPER_TO_LOWER[0x00DC] = 0x00FC  -- Ü → ü
  UPPER_TO_LOWER[0x011E] = 0x011F  -- Ğ → ğ
  UPPER_TO_LOWER[0x015E] = 0x015F  -- Ş → ş
end

-- Lowercase → capital codepoint map using Turkish case pairing.
-- q, w and x deliberately get no entry.
local LOWER_TO_UPPER = {}
do
  local skipped = { [0x71] = true, [0x77] = true, [0x78] = true }  -- q w x

  -- ASCII lowercase letters sit exactly 0x20 above their capitals.
  for cp = 0x61, 0x7A do
    if not skipped[cp] then
      LOWER_TO_UPPER[cp] = cp - 0x20
    end
  end

  -- Turkish dotted/dotless pairing replaces the ASCII rule for i.
  LOWER_TO_UPPER[0x69]   = 0x0130  -- i → İ
  LOWER_TO_UPPER[0x0131] = 0x49    -- ı → I

  -- Remaining non-ASCII lowercase letters.
  LOWER_TO_UPPER[0x00E7] = 0x00C7  -- ç → Ç
  LOWER_TO_UPPER[0x00F6] = 0x00D6  -- ö → Ö
  LOWER_TO_UPPER[0x00FC] = 0x00DC  -- ü → Ü
  LOWER_TO_UPPER[0x011F] = 0x011E  -- ğ → Ğ
  LOWER_TO_UPPER[0x015F] = 0x015E  -- ş → Ş
end

-- True when cp is treated as a letter: any capital known to
-- UPPER_TO_LOWER, ASCII a-z, or one of the lowercase specials
-- ı ü ö ç ş ğ.  Always returns a boolean.
local function is_letter(cp)
  if UPPER_TO_LOWER[cp] ~= nil then
    return true
  end
  if cp >= 0x61 and cp <= 0x7A then   -- a-z
    return true
  end
  return cp == 0x0131   -- ı
      or cp == 0x00FC   -- ü
      or cp == 0x00F6   -- ö
      or cp == 0x00E7   -- ç
      or cp == 0x015F   -- ş
      or cp == 0x011F   -- ğ
end

-- Encode cp_out as a UTF-8 string.  When was_upper is truthy the capital
-- form from LOWER_TO_UPPER is used if one exists; a codepoint without a
-- mapping is encoded unchanged.
local function apply_case(cp_out, was_upper)
  local cp = cp_out
  if was_upper then
    cp = LOWER_TO_UPPER[cp_out] or cp_out
  end
  return utf8.char(cp)
end

-- Lowercase vowels that harmonize() rewrites according to the stem.
local HARMONY_V = {
  [0x61]   = true,  -- a
  [0x65]   = true,  -- e
  [0x69]   = true,  -- i
  [0x75]   = true,  -- u
  [0x0131] = true,  -- ı
  [0x00FC] = true,  -- ü
}

-- Lowercase vowels that are copied through unchanged.
local INVARIANT_V = {
  [0x6F]   = true,  -- o
  [0x00F6] = true,  -- ö
}

-- Truthy when cp is a lowercase vowel from either vowel set.
-- Returns true on a hit and nil otherwise (not false).
local function is_vowel(cp)
  if HARMONY_V[cp] then
    return true
  end
  return INVARIANT_V[cp]
end

-- Set of lowercase consonants counted as voiced.
local VOICED_C = {}
for _, cp in ipairs({
  0x62,    -- b
  0x63,    -- c
  0x64,    -- d
  0x67,    -- g
  0x011F,  -- ğ
  0x6A,    -- j
  0x6C,    -- l
  0x6D,    -- m
  0x6E,    -- n
  0x72,    -- r
  0x76,    -- v
  0x79,    -- y
  0x7A,    -- z
}) do
  VOICED_C[cp] = true
end

-- True when cp (lowercase) is a vowel or a member of VOICED_C.
-- Always returns a boolean.
local function is_voiced(cp)
  if is_vowel(cp) then
    return true
  end
  return VOICED_C[cp] == true
end

-- Pick the 4-way harmony vowel for a frontness/rounding pair and return
-- its codepoint:
--   front + unrounded → i     front + rounded → ü
--   back  + unrounded → ı     back  + rounded → u
local function vowel4cp(f, r)
  if f then
    if r then
      return 0x00FC  -- ü
    end
    return 0x69      -- i
  end
  if r then
    return 0x75      -- u
  end
  return 0x0131      -- ı
end

------------------------------------------------------------------------
-- Invariant morphemes (Büyük Ünlü Uyumuna Uymayan Ekler),
-- longest first to prevent partial prefix matches.
-- f, r: harmony class of the morpheme's last vowel (update running state).
-- voiced: voicing of the morpheme's last character.
------------------------------------------------------------------------
local INVARIANT = {
  -- "mtırak": the ı is written as the UTF-8 bytes \xC4\xB1
  { str = "mt\xC4\xB1rak", f = false, r = false, voiced = false },
  { str = "leyin",          f = true,  r = false, voiced = true  },
  { str = "gil",            f = true,  r = false, voiced = true  },
  { str = "ken",            f = true,  r = false, voiced = true  },
  { str = "yor",            f = false, r = true,  voiced = true  },
  { str = "ki",             f = true,  r = false, voiced = true  },
}

-- Decode every morpheme once up front so matching can compare codepoints:
--   m.cps         array of the morpheme's (lowercase) codepoints
--   m.char_count  number of characters, i.e. #m.cps
for i = 1, #INVARIANT do
  local m = INVARIANT[i]
  local decoded = { utf8.codepoint(m.str, 1, -1) }
  m.cps = decoded
  m.char_count = #decoded
end

-- Does the character list contain morpheme m starting at position idx?
-- The comparison uses each character's .lower codepoint against m.cps,
-- so the original letter case is ignored.  Returns false when fewer than
-- m.char_count characters remain from idx onward.
local function matches_morpheme(chars, idx, m)
  local len = m.char_count
  if #chars - idx + 1 < len then
    return false
  end
  for offset = 0, len - 1 do
    if chars[idx + offset].lower ~= m.cps[offset + 1] then
      return false
    end
  end
  return true
end

------------------------------------------------------------------------
-- harmonize(suffix, props)
--   Rewrites a written suffix so that it agrees with the number word
--   described by props (a table with the WORD_PROPS fields f, r, v, d).
--
--   Returns two strings: prefix (everything before the first letter of
--   suffix, e.g. an apostrophe) and algo (the transformed letters).
--   The caller concatenates them; splitting allows override lookup on
--   algo alone.  When suffix contains no letter the result is suffix, "".
--
--   State carried while scanning the suffix letters left to right:
--     f, r         frontness / rounding of the nearest preceding vowel
--     last_voiced  whether the preceding sound is voiced
------------------------------------------------------------------------
local function harmonize(suffix, props)
  local f           = props.f
  local r           = props.r
  local last_voiced = props.v or props.d

  -- Step 0: find the first letter; everything before it is the prefix.
  local first_letter
  for p, cp in utf8.codes(suffix) do
    if is_letter(cp) then
      first_letter = p
      break
    end
  end
  if not first_letter then return suffix, "" end
  local prefix = suffix:sub(1, first_letter - 1)

  -- Decode the letter part.  Each record keeps the original codepoint
  -- (cp), its lowercase form (lower, drives all decisions) and whether
  -- the original was a capital (upper, used to restore case on output).
  local chars = {}
  for _, cp in utf8.codes(suffix:sub(first_letter)) do
    local lo = UPPER_TO_LOWER[cp]
    chars[#chars + 1] = {cp = cp, lower = lo or cp, upper = lo ~= nil}
  end
  local nch = #chars   -- at least 1: a letter was found above

  local out = {}       -- output pieces, joined once at the end
  local idx = 1

  -- ── Step 1: kaynaştırma harfi / ünlü düşmesi ─────────────────────
  -- Applies only when stem ends in a vowel and suffix starts with one.
  -- An inserted buffer consonant takes the case of the suffix's first
  -- letter so that an all-caps suffix stays all-caps.
  if props.v and HARMONY_V[chars[1].lower] then
    if nch == 1 then
      -- Case A: single-vowel suffix → y buffer (accusative default)
      out[#out + 1] = apply_case(0x79, chars[1].upper)
    elseif nch == 2 and chars[2].lower == 0x6E then
      -- Case B: V + n → n buffer (genitive)
      out[#out + 1] = apply_case(0x6E, chars[1].upper)
    else
      -- Case C: all other vowel-initial suffixes → drop initial vowel
      idx = 2
    end
  end

  -- ── Step 2: character-by-character scan ──────────────────────────
  while idx <= nch do
    local ch = chars[idx]
    local cp = ch.lower   -- lowercase codepoint drives all logic
    local up = ch.upper   -- true if original character was uppercase

    -- Invariant morpheme (case-insensitive): copy it through with its
    -- original characters and take over its harmony/voicing state.
    local hit = false
    for _, m in ipairs(INVARIANT) do
      if matches_morpheme(chars, idx, m) then
        for j = 0, m.char_count - 1 do
          out[#out + 1] = utf8.char(chars[idx + j].cp)
        end
        f           = m.f
        r           = m.r
        last_voiced = m.voiced
        idx         = idx + m.char_count
        hit         = true
        break
      end
    end
    if hit then goto continue end

    -- daş/taş: leading consonant alternates, vowel 'a' is invariant.
    -- ş = U+015F
    if idx + 2 <= nch then
      local c1 = chars[idx].lower
      local c2 = chars[idx + 1].lower
      local c3 = chars[idx + 2].lower
      if (c1 == 0x64 or c1 == 0x74) and c2 == 0x61 and c3 == 0x015F then
        out[#out + 1] = apply_case(last_voiced and 0x64 or 0x74, chars[idx].upper)
        out[#out + 1] = apply_case(0x61,   chars[idx + 1].upper)
        out[#out + 1] = apply_case(0x015F, chars[idx + 2].upper)
        f           = false   -- 'a' is back
        r           = false   -- 'a' is unrounded
        last_voiced = false   -- ş is voiceless
        idx         = idx + 3
        goto continue
      end
    end

    if cp == 0x64 or cp == 0x74 then           -- d / t: follows preceding voicing
      if last_voiced then
        out[#out + 1] = apply_case(0x64, up); last_voiced = true
      else
        out[#out + 1] = apply_case(0x74, up); last_voiced = false
      end

    elseif cp == 0x63 or cp == 0x00E7 then     -- c / ç: follows preceding voicing
      if last_voiced then
        out[#out + 1] = apply_case(0x63,   up); last_voiced = true
      else
        out[#out + 1] = apply_case(0x00E7, up); last_voiced = false
      end

    elseif cp == 0x65 or cp == 0x61 then       -- e / a: 2-way harmony
      out[#out + 1] = apply_case(f and 0x65 or 0x61, up)
      -- Both e and a are unrounded, so any later 4-way vowel must be
      -- unrounded too (e.g. 3 + "lerin" → "lerin", not "lerün").
      r           = false
      last_voiced = true

    elseif HARMONY_V[cp] then                  -- i ı ü u: 4-way harmony
      -- The emitted vowel has exactly the current (f, r), so the running
      -- state is already correct for what follows.
      out[#out + 1] = apply_case(vowel4cp(f, r), up); last_voiced = true

    elseif INVARIANT_V[cp] then                -- o / ö: pass through, preserve case
      out[#out + 1] = apply_case(cp, up)
      -- Later vowels harmonize with this one: ö is front, o is back,
      -- both are rounded.
      f           = (cp == 0x00F6)
      r           = true
      last_voiced = true

    else                                        -- all other characters: verbatim
      out[#out + 1] = utf8.char(ch.cp)
      last_voiced = is_voiced(cp)
    end

    idx = idx + 1
    ::continue::
  end

  return prefix, table.concat(out)
end

------------------------------------------------------------------------
-- Public API
------------------------------------------------------------------------
-- Per-number override registry: overrides[n][algo] = replacement
-- algo is the letter-only suffix the algorithm produces (e.g. "te", "üncü", "da").
-- output_suffix looks the harmonized suffix up here and, when an entry
-- exists, emits the replacement instead of algo.
local overrides = {}

-- Register `replacement` as the text to emit whenever the algorithm
-- produces `algo_output` for number n.  n may be a number or a numeric
-- string; it is floored, and anything non-numeric is filed under 0.
function M.set_override(n, algo_output, replacement)
  local key = math.floor(tonumber(n) or 0)
  local bucket = overrides[key]
  if not bucket then
    bucket = {}
    overrides[key] = bucket
  end
  bucket[algo_output] = replacement
end

-- Delete the override stored for (n, algo_output), if any.  n is
-- normalised the same way as in set_override.  Does nothing when no
-- override table exists for that number.
function M.remove_override(n, algo_output)
  local bucket = overrides[math.floor(tonumber(n) or 0)]
  if not bucket then
    return
  end
  bucket[algo_output] = nil
end

-- When set to true, output_suffix logs its input (number and suffix) and
-- the final result through texio.write_nl.
M.debug = false

-- Harmonize `suffix` for the number in n_str and write the result to TeX
-- with tex.sprint.
--   n_str   number or string; for dotted references such as "2.3" only the
--           part after the last dot is used (3 = üç, not 2 = iki).
--           A non-numeric value is treated as 0.
--   suffix  the suffix as written, optionally led by non-letter characters.
-- An override registered for (n, harmonized letters) replaces the letters;
-- the leading non-letter prefix is always kept.
function M.output_suffix(n_str, suffix)
  -- tostring() keeps a string argument intact (e.g. "1.10").
  local text = tostring(n_str)
  local last = text:match("[^.]+$") or text
  local n = math.floor(tonumber(last) or 0)

  if M.debug then
    texio.write_nl("turkce-sayi: n=" .. n .. "  suffix=[" .. suffix .. "]")
  end

  local prefix, algo = harmonize(suffix, last_word(n))

  local per_number = overrides[n]
  if per_number then
    algo = per_number[algo] or algo
  end

  local final = prefix .. algo
  if M.debug then
    texio.write_nl("turkce-sayi: result=[" .. final .. "]")
  end

  tex.sprint(final)
end

-- bib_output_suffix(keys, suffix)
--   keys:   comma-separated cite key list (e.g. "smith,jones")
--   suffix: suffix text, handed on to output_suffix unchanged
--   Takes the LAST key of the list, trims surrounding whitespace, reads the
--   macro "b@<key>" with token.get_macro and uses its numeric value as the
--   citation number.  A missing key, an undefined macro or a non-numeric
--   value all fall back to 0.  Requires two LaTeX passes (the cite number
--   is written to .aux on pass 1 and read back on pass 2).
function M.bib_output_suffix(keys, suffix)
  local n = 0
  local last_key = keys:match("[^,]+$")
  if last_key then
    last_key = last_key:match("^%s*(.-)%s*$")   -- trim whitespace
    if last_key ~= "" then
      local val = token.get_macro("b@" .. last_key)
      n = tonumber(val) or 0
    end
  end
  M.output_suffix(n, suffix)
end

-- Export the module table to whoever loads this file.
return M