"""
Filter noisy scraped package labels and normalize display names to English (best-effort).
"""

from __future__ import annotations

import json
import re
from pathlib import Path

# Promo / UI lines that are not real product packages (e.g. bonus comparisons).
# Matches "176 (=) 186"-style coin-conversion fragments from promo widgets.
_COMPARISON_EQUALS = re.compile(
    r"\d[0-9.,]*\s*\(\s*=\s*\)\s*\d",
    re.IGNORECASE,
)
# Labels made of nothing but whitespace, digits, punctuation and "(=)" — pure scrape noise.
_ONLY_NOISE = re.compile(r"^[\s(=).,0-9]+$")

# Mobile Legends / smile.one: real SKUs look like "Diamond×625+81". Scrapes sometimes merge
# two product lines into one title (e.g. "71 Diamonds + 8 78 Diamonds + 8").
# Canonical smile.one SKU chunk: "Diamond×N" / "DiamondxN", with optional "+bonus" tail.
_DIAMOND_X_CHUNK = re.compile(
    r"Diamonds?\s*[×xX]\s*\d+(?:\s*\+\s*\d+)?",
    re.IGNORECASE,
)
# Plain "N Diamonds + M" segment (base amount plus bonus amount).
_DIAMONDS_PLUS_CHUNK = re.compile(
    r"\d{1,7}\s*Diamonds?\s*\+\s*\d+",
    re.IGNORECASE,
)


def _is_mlbb_pass_or_pack_label(s: str) -> bool:
    """
    True for MLBB subscription / pass rows. These often include both a pass title and a
    Diamond×NNN line in the same scraped string; collapsing to bare Diamond×N breaks
    auto-order matching (user picks Weekly Diamond Pass, order uses Diamondx250).
    """
    low = (s or "").lower()
    # Known pass/pack titles that must never be collapsed to a bare Diamond×N.
    fixed_titles = (
        "weekly diamond pass",
        "twilight pass",
        "elite weekly",
        "epic monthly",
    )
    if any(title in low for title in fixed_titles):
        return True
    if "diamond pass" in low and ("weekly" in low or "monthly" in low):
        return True
    # "weekly … pass" / "pass … weekly" in either order; same for monthly.
    if re.search(r"\bweekly\b.*\bpass\b|\bpass\b.*\bweekly\b", low):
        return True
    if re.search(r"\bmonthly\b.*\bpass\b|\bpass\b.*\bmonthly\b", low):
        return True
    # Portuguese (if phrase replacement did not run on this fragment)
    if re.search(r"passe\s+semanal.*diamante|semanal\s+de\s+diamante", low):
        return True
    if "passagem" in low and "crep" in low:
        return True
    # English short titles (no "Diamond" word) still must not collapse to Diamond×N
    return "weekly" in low and "pass" in low and "elite" not in low


def sanitize_diamond_merged_labels(name: str) -> str:
    """
    Fix MLBB-style labels where parent DOM text merged two products or promo lines.

    - Prefer a single canonical chunk: Diamond×NNN+MM (drops stray numbers / duplicate phrases).
    - If multiple "N Diamonds + M" segments appear in one string, keep only the first.

    Returns the cleaned label; empty/blank input is returned as-is.
    """
    s = (name or "").strip()
    if not s:
        return s
    s = re.sub(r"\s+", " ", s)

    # Pass / subscription titles must keep their full wording — never collapse those.
    if _is_mlbb_pass_or_pack_label(s):
        return s

    # Canonical smile.one SKU present ("Diamond×625+81"): keep only that chunk,
    # squeezed and singularized so the display label is uniform.
    m = _DIAMOND_X_CHUNK.search(s)
    if m:
        part = m.group(0)
        part = re.sub(r"\s+", "", part)
        part = re.sub(r"(?i)diamonds?", "Diamond", part)
        return part

    blocks = list(_DIAMONDS_PLUS_CHUNK.finditer(s))
    if len(blocks) >= 2:
        a, b = blocks[0].group(0).strip(), blocks[1].group(0).strip()
        # NOTE: base amount re-parsed with \d{1,4} (chunk regex allows up to 7 digits);
        # 5+-digit bases simply fail this match and fall through to "keep first".
        ma = re.match(r"^(\d{1,4})\s*Diamonds?", a, re.IGNORECASE)
        mb = re.match(r"^(\d{1,4})\s*Diamonds?", b, re.IGNORECASE)
        # Promo line + real pack merged: prefer the larger diamond count when first is small (e.g. 71 then 78).
        if ma and mb and int(ma.group(1)) <= 99 and int(mb.group(1)) > int(ma.group(1)):
            return b
        return a
    if len(blocks) == 1:
        return blocks[0].group(0).strip()

    # Long merged line without a clean regex match: cut before second large "NNNN Diamonds"
    if len(s) > 55 and s.lower().count("diamonds") >= 2:
        cut = re.search(
            r"^(.+?\d{1,7}\s*Diamonds?\s*\+\s*\d{1,7})(?=\s+\d{3,}\s*Diamonds?)",
            s,
            re.IGNORECASE,
        )
        if cut:
            return cut.group(1).strip()

    return s


def strip_bonus_extra_noise(name: str) -> str:
    """
    Remove smile.one UI fragments often scraped into titles (Where Winds Meet, etc.).
    """
    text = (name or "").strip()
    if not text:
        return text
    # Ordered (pattern, replacement, flags) cleanup rules. Order matters: collapse
    # newlines first, then the favorites button, "Bonus Extra" prefix/suffix, a
    # duplicated game name, and finally trailing navigation/promo fragments.
    rules = (
        (r"\s*[\n\r]+\s*", " ", 0),
        (r"Adicionar\s+aos\s+favoritos", "", re.IGNORECASE),
        (r"\s+(Bonus\s+Extra|B[ôo]nus\s+Extra)\s*$", "", re.IGNORECASE),
        (r"^(Bonus\s+Extra|B[ôo]nus\s+Extra)\s+", "", re.IGNORECASE),
        (r"(Where\s+Winds\s+Meet)(\s+\1)+", r"\1", re.IGNORECASE),
        # smile.one: strip trailing promo / navigation fragments merged into card titles
        (
            r"(?i)\s*[-–—|]?\s*(ver\s+todos|veja\s+mais|mais\s+jogos|outros\s+jogos|jogos\s+populares|hot\s+games|popular\s+games).*$",
            "",
            0,
        ),
    )
    for pattern, replacement, flags in rules:
        text = re.sub(pattern, replacement, text, flags=flags)
    text = re.sub(r"\s+", " ", text).strip()
    return _strip_smile_promo_suffixes(text)


def _strip_smile_promo_suffixes(s: str) -> str:
    """
    /br merchant titles often merge promo tails into package card text:
    '+10% DE Bonus', 'Discount DE ATÉ 80%', '12% Off', '15% OFF', '-35% Off', etc.

    Strips both **inline** (DDTank Origin 12% Off, Love Nikki … 15% OFF) and **suffix**
    promos. Loops until a fixed point so stacked fragments fully disappear.

    Note: the original had several redundant substitutions removed here — the
    "NN% OFF" variants are identical to "NN% Off" under (?i), and the suffix-only
    '+N% DE Bonus' / 'Discount DE ATÉ…' rules were subsets of the anywhere rules.
    """
    if not s:
        return s
    prev = None
    while prev != s:
        prev = s
        # Anywhere: smile.one bundles %-off and DE ATÉ text into the same line as the SKU
        s = re.sub(r"(?i)\s*\+\d+\s*%\s*DE\s+Bonus", "", s)
        s = re.sub(r"(?i)\s*Discount\s+DE\s+(?:ATÉ|ATE)(?:\s+\d+\s*%?)?", "", s)
        s = re.sub(r"(?i)\s*-\s*\d+\s*%\s*Off\b", "", s)
        s = re.sub(r"(?i)\s+\d+\s*%\s*Off\b", "", s)
        s = re.sub(r"(?i)\s+Code\s+Discounts\s+for\s+.+$", "", s)
        # Suffix-only (avoid eating real words)
        s = re.sub(r"(?i)\s*DE\s+(?:ATÉ|ATE)\s+\d+\s*%?\s*$", "", s)
        s = re.sub(r"(?i)\s*\d+\s*%\s*Off\s*$", "", s)
        s = re.sub(r"(?i)\s*ON\s+SALE\s*$", "", s)
        s = re.sub(r"(?i)\s+Discount\s*$", "", s)
        s = re.sub(r"(?i)\s*\(ROM\)\s*$", "", s)
        s = re.sub(r"\s+", " ", s).strip()
    return s


def collapse_merged_smile_tier_label(name: str) -> str:
    """
    smile.one /br often merges a whole diamond list into one card title:
      "2 SELECIONE O Pack 100 Diamonds + 10% Bonus 310 Diamonds + 10% Bonus …"
    Keep a single SKU line (first tier) when checkout text or multiple tiers are present.
    Run this AFTER phrase replacements so "Bonus/Bônus" matches reliably.

    Returns the (possibly collapsed) label; empty/blank input is returned as-is.
    """
    s = (name or "").strip()
    if not s:
        return s

    # Truncated PT "diaman…" → count as Diamonds for de-merge
    s_work = re.sub(r"(?i)\bdiaman\w*\b", "Diamonds", s)

    diamond_chunks = len(re.findall(r"\d{1,7}\s+Diamonds?", s_work, flags=re.IGNORECASE))
    long_merged = len(s) > 72 and diamond_chunks >= 2
    if not re.search(r"(?i)SELECIONE", s) and not long_merged and diamond_chunks < 2:
        return s

    # Drop leading checkout-step text ("2 SELECIONE O PACK …").
    s = re.sub(r"(?i)^\d+\s+SELECIONE\s+O\s+(?:PACK|PACOTE)\s*", "", s)
    s = re.sub(r"(?i)^SELECIONE\s+O\s+(?:PACK|PACOTE)\s*", "", s)
    s = re.sub(r"(?i)^\d+\s+SELECIONE\s+O\s*", "", s)
    s = re.sub(r"(?i)^SELECIONE\s+O\s*", "", s)
    s = re.sub(r"\s+", " ", s).strip()

    # English "Bonus" + PT "Bônus" / "Bonus" (ASCII)
    tier_bonus = re.compile(
        r"\d{1,7}\s+Diamonds?\s*\+\s*10%\s*(?:B[ôo]nus|Bonus)",
        re.IGNORECASE,
    )
    tiers = list(tier_bonus.finditer(s))
    if tiers:
        return tiers[0].group(0).strip()

    if diamond_chunks >= 2:
        # FIX: extraction must see the same "diaman…"→Diamonds normalization that the
        # chunk counting above used, otherwise a merged truncated-PT list is detected
        # but returned uncollapsed (the search on the raw string never matched).
        s_norm = re.sub(r"(?i)\bdiaman\w*\b", "Diamonds", s)
        m = re.search(r"(\d{1,7}\s+Diamonds?)", s_norm, re.IGNORECASE)
        if m:
            return m.group(1).strip()
    return s


# PUBG Mobile / smile.one: whole UC tier list sometimes merged into one card title.
# One UC SKU chunk: "N UC" or "N + M UC" (base + bonus), e.g. "300 + 25 UC".
_UC_TIER_CHUNK = re.compile(
    r"\d{1,7}(?:\s*\+\s*\d{1,7})?\s+UC\b",
    re.IGNORECASE,
)


def collapse_merged_uc_tier_label(name: str) -> str:
    """
    smile.one /br may merge multiple UC SKUs into one label, e.g.:
      "60 UC 300 + 25 UC 600 + 60 UC 1500 + 300 UC 3000 + 850 UC"
    Keep the first tier only (same idea as multi-tier Diamond collapse).
    """
    label = (name or "").strip()
    if not label:
        return label
    label = re.sub(r"\s+", " ", label)
    # One UC SKU chunk looks like "N UC" or "N + M UC" (base + bonus).
    chunks = re.findall(r"\d{1,7}(?:\s*\+\s*\d{1,7})?\s+UC\b", label, flags=re.IGNORECASE)
    if len(chunks) < 2:
        return label
    return chunks[0].strip()


# Honor of Kings / similar: merged "16 TOKENS 80 TOKENS … 2400" in one card title.
# One TOKEN SKU chunk: "N TOKENS" / "N + M TOKENS" (base + bonus), singular accepted.
_TOKENS_TIER_CHUNK = re.compile(
    r"\d{1,7}(?:\s*\+\s*\d{1,7})?\s+(?:TOKENS|TOKEN)\b",
    re.IGNORECASE,
)


def collapse_merged_tokens_tier_label(name: str) -> str:
    """
    smile.one may concatenate every TOKEN tier into one label (often the highest-price card).
    When multiple "NN (+M)? TOKENS" chunks appear, keep the **last** SKU line so the name
    matches the expensive pack (first tier would show e.g. 16 TOKENS for a 2400-tier price).

    Trailing "2400" without the word TOKENS (line-wrap scrape) is normalized to "2400 TOKENS".
    """
    label = (name or "").strip()
    if not label:
        return label
    label = re.sub(r"\s+", " ", label)
    chunk_re = re.compile(r"\d{1,7}(?:\s*\+\s*\d{1,7})?\s+(?:TOKENS|TOKEN)\b", re.IGNORECASE)
    hits = list(chunk_re.finditer(label))
    if len(hits) < 2:
        return label
    final = hits[-1]
    # Line-wrap scrape may leave a bare trailing amount after the last full chunk.
    tail = label[final.end():].strip()
    bare_amount = re.fullmatch(r"(\d{1,7}(?:\s*\+\s*\d{1,7})?)", tail)
    if bare_amount:
        return f"{bare_amount.group(1)} TOKENS"
    # Normalize singular "TOKEN" so the label always reads "NN TOKENS".
    return re.sub(r"(?i)\s+TOKEN\b$", " TOKENS", final.group(0).strip())


# smile.one /br Portuguese: full tier list merged into one card title, e.g.
#   "Pack de 100 Cristais Pack de 500 Cristais … Pack de 5,000 Cr"
# ("Cr" = truncated "Cristais" from DOM/wrap.)
# One tier: "Pack/Pacote de <amount> <unit>"; amount may contain "," or ".".
_PACK_PT_PACK_CHUNK = re.compile(
    r"(?:Pack|Pacote)\s+de\s+[\d.,]+\s+(?:Cristais|Cristal|Crystals?|Cr)\b",
    re.IGNORECASE,
)


def collapse_merged_pt_pack_tiers(name: str) -> str:
    """
    Collapse multiple 'Pack de N Cristais' / 'Pacote de …' segments to the last tier
    (matches expensive pack). Normalizes Cristais→Crystals and trailing 'Cr'→Crystals.
    """
    label = (name or "").strip()
    if not label:
        return label
    label = re.sub(r"\s+", " ", label)
    # Cheap pre-check before scanning for full tier chunks.
    if not re.search(r"(?i)(?:Pack|Pacote)\s+de\s+", label):
        return label
    chunks = re.findall(
        r"(?:Pack|Pacote)\s+de\s+[\d.,]+\s+(?:Cristais|Cristal|Crystals?|Cr)\b",
        label,
        flags=re.IGNORECASE,
    )
    if len(chunks) < 2:
        return label
    tier = chunks[-1].strip()
    # English-normalize the kept tier (PT wording + truncated "Cr").
    for pattern, replacement in (
        (r"(?i)^Pacote\s+de", "Pack de"),
        (r"(?i)\bCristais\b", "Crystals"),
        (r"(?i)\bCristal\b", "Crystal"),
        (r"(?i)\s+Cr\b", " Crystals"),
    ):
        tier = re.sub(pattern, replacement, tier)
    return tier


# Lords Mobile / many non-MLBB merchants: merged tier lists in one card (gems, crystals, CP, etc.).
# UC / TOKENS are handled in separate steps — do not duplicate those units here.
# One tier: "N <unit>" or "N + M <unit>"; the alternation mixes PT (Diamantes) and
# English currency words. "CPS" is listed before "CP" so the longer token wins.
_GEM_DIAMOND_TIER_CHUNK = re.compile(
    r"\d{1,7}(?:\s*\+\s*\d{1,7})?\s+(?:"
    r"Diamantes?|Diamonds?|Gems?|Crystals?|Credits?|Bonds?|Vouchers?|"
    r"Silver\s+Coins?|Copper\s+Coins?|Star\s+Coins?|Rainbow\s+Coins?|"
    r"Rubies?|Ingots?|Shards?|Essences?|Tears?|Souls?|"
    r"Lunacy|Manda|"
    r"Gold|"
    r"CPS|CP"
    r")\b",
    re.IGNORECASE,
)


def _suffix_unit_from_tier_chunk(chunk: str) -> str:
    """Normalize orphan trailing number to the same unit as the last full tier match."""
    text = (chunk or "").strip()
    # Ordered (pattern, unit) table; first hit wins. "CPS" must be probed before "CP",
    # and two-word coin names anchor only the trailing word boundary.
    unit_table = (
        (r"(?i)\bDiamantes?\b", "Diamonds"),
        (r"(?i)\bCrystals?\b", "Crystals"),
        (r"(?i)\bCredits?\b", "Credits"),
        (r"(?i)\bBonds?\b", "Bonds"),
        (r"(?i)\bVouchers?\b", "Vouchers"),
        (r"(?i)\bGems?\b", "Gems"),
        (r"(?i)Silver\s+Coins?\b", "Silver Coins"),
        (r"(?i)Copper\s+Coins?\b", "Copper Coins"),
        (r"(?i)\bGold\b", "Gold"),
        (r"(?i)\bCPS\b", "CPS"),
        (r"(?i)\bCP\b", "CP"),
        (r"(?i)\bRubies?\b", "Rubies"),
        (r"(?i)\bIngots?\b", "Ingots"),
        (r"(?i)\bShards?\b", "Shards"),
        (r"(?i)\bEssences?\b", "Essences"),
        (r"(?i)\bTears?\b", "Tears"),
        (r"(?i)\bSouls?\b", "Souls"),
        (r"(?i)\bLunacy\b", "Lunacy"),
        (r"(?i)\bManda\b", "Manda"),
        (r"(?i)Star\s+Coins?\b", "Star Coins"),
        (r"(?i)Rainbow\s+Coins?\b", "Rainbow Coins"),
    )
    for pattern, unit in unit_table:
        if re.search(pattern, text):
            return unit
    # Unknown unit: Diamonds is by far the most common case.
    return "Diamonds"


def collapse_merged_gem_diamond_tier_label(
    name: str, *, product_url: str | None = None
) -> str:
    """
    Same scrape bug as merged TOKENS/UC: the whole gem/diamond tier list in one title.
    MLBB uses a separate pipeline (Diamond×, etc.); do not run this for MLBB URLs.

    When multiple tiers appear, keep the **last** chunk so the label matches the top pack.
    """
    label = (name or "").strip()
    # MLBB labels are handled by the Diamond×-specific steps; never collapse them here.
    if _is_mlbb_product_url(product_url):
        return label
    if not label:
        return label
    label = re.sub(r"\s+", " ", label)
    hits = list(_GEM_DIAMOND_TIER_CHUNK.finditer(label))
    if len(hits) < 2:
        return label
    final = hits[-1]
    tail = label[final.end():].strip()
    # Line-wrap scrape may leave a bare trailing amount; reattach the last tier's unit.
    bare = re.fullmatch(r"(\d{1,7}(?:\s*\+\s*\d{1,7})?)", tail)
    if bare:
        return f"{bare.group(1)} {_suffix_unit_from_tier_chunk(final.group(0))}"
    kept = final.group(0).strip()
    # PT "Diamantes" → English for display parity with the rest of the pipeline.
    if re.search(r"(?i)\bDiamantes?\b", kept):
        kept = re.sub(r"(?i)\bDiamantes?\b", "Diamonds", kept)
    return kept


def diamond_base_only_display(name: str) -> str:
    """
    MLBB: show base diamond amount only (drop bonus '+ N' tail the user asked to remove).

    Examples: '563 Diamonds + 73' -> 'Diamondx563', 'Diamondx78+8' -> 'Diamondx78'.
    """
    label = (name or "").strip()
    if not label:
        return label
    # Only diamond SKUs qualify; anything without the word is left alone.
    if not re.search(r"Diamond", label, re.IGNORECASE):
        return label
    # Pass / pack titles must keep their full wording.
    if _is_mlbb_pass_or_pack_label(label):
        return label
    # Three accepted shapes, each collapsing to "Diamondx<base>" (ASCII x for UI /
    # smile.one parity): Diamond×N+M, "N Diamonds + M", and plain "N Diamonds".
    shapes = (
        r"^Diamond\s*[×xX]\s*(\d+)\s*\+\s*(\d+)\s*$",
        r"^(\d{1,7})\s*Diamonds?\s*\+\s*(\d{1,7})\s*$",
        r"^(\d{1,7})\s+Diamonds?\s*$",
    )
    for pattern in shapes:
        hit = re.match(pattern, label, re.IGNORECASE)
        if hit:
            return f"Diamondx{hit.group(1)}"
    return label


def _is_plain_n_diamonds_label(raw: str) -> bool:
    """True for bare '<N> Diamonds' labels (stray scrape rows, not smile.one SKUs)."""
    text = (raw or "").strip()
    return re.match(r"^\d{1,7}\s+Diamonds?\s*$", text, re.IGNORECASE) is not None


def _has_mlbb_diamond_shape_in_raw(raw: str) -> bool:
    """True if raw label looks like smile.one SKU (Diamond×N / DiamondxN), not 'N Diamonds' noise."""
    candidate = (raw or "").strip()
    return re.search(r"(?i)Diamond\s*[×xX]\s*\d+|Diamondx\d+", candidate) is not None


def mlbb_drop_plain_duplicate_at_same_price(
    rows: list[dict],
    *,
    product_url: str | None,
    raw_key: str = "raw_name",
) -> list[dict]:
    """
    Same BRL often appears twice: a stray '211 Diamonds' scrape and the real 'Diamond×234' row.
    When both exist, keep the Diamond×/Diamondx-style row only.

    Args:
        rows: scraped package rows; the price is read from ``php_cents`` for /ph/
            URLs and from ``brl_cents`` otherwise. Rows with a missing, zero, or
            unparsable price are dropped.
        product_url: merchant product URL; non-MLBB URLs return ``rows`` unchanged.
        raw_key: dict key holding the raw scraped label.

    Returns:
        Rows grouped by price in first-seen price order, with plain 'N Diamonds'
        duplicates removed whenever the same price also has a Diamond×-shaped row.
    """
    if not _is_mlbb_product_url(product_url):
        return rows

    # /ph/ pages price in PHP centavos; everything else in BRL centavos.
    use_php = bool(product_url and "/ph/" in (product_url or "").lower())
    price_key = "php_cents" if use_php else "brl_cents"

    # Single pass: group rows per price AND remember first-seen price order
    # (the original did this in two identical loops over ``rows``).
    by_cents: dict[int, list[dict]] = {}
    cent_order: list[int] = []
    for row in rows:
        try:
            cents = int(row.get(price_key) or 0)
        except (TypeError, ValueError):
            continue
        if cents <= 0:
            continue
        if cents not in by_cents:
            by_cents[cents] = []
            cent_order.append(cents)
        by_cents[cents].append(row)

    out: list[dict] = []
    for cents in cent_order:
        group = by_cents[cents]
        has_shape = any(_has_mlbb_diamond_shape_in_raw(r.get(raw_key) or "") for r in group)
        plain = [r for r in group if _is_plain_n_diamonds_label(r.get(raw_key) or "")]
        if has_shape and plain:
            # Drop the stray plain rows; keep the Diamond×-shaped (and other) rows.
            out.extend(r for r in group if r not in plain)
        else:
            out.extend(group)
    return out


# Word/phrase replacements (Portuguese / site jargon → English). Longer phrases first.
# Covers smile.one Brazil pages (Mobile Legends, HOK, Free Fire, etc.).
# Entries are applied IN ORDER by the caller, so earlier entries win; three dead
# duplicates were removed (Descontos dup of DESCONTOS, Nível subsumed by N[ií]vel,
# and a second Recarregar rule unreachable after the Recarregar→"Top-up" mapping).
_PHRASES: list[tuple[re.Pattern[str], str]] = [
    # —— Merchant home cards (all-caps promo lines, subtitles) ——
    (
        re.compile(r"\s+DESCONTOS\s+DE\s+PUBG\s+MOBILE\s*", re.IGNORECASE),
        " - PUBG Mobile discounts",
    ),
    (re.compile(r"\bDESCONTOS\s+DE\s+PUBG\s+MOBILE\b", re.IGNORECASE), "PUBG Mobile discounts"),
    (re.compile(r"\bDESCONTOS\s+DE\b", re.IGNORECASE), "Discounts for"),
    (re.compile(r"\bDESCONTOS\b", re.IGNORECASE), "Discounts"),
    (re.compile(r"Guerra\s+de\s+Reinos", re.IGNORECASE), "War of Kingdoms"),
    (re.compile(r"\bGema\s+do\s+Destino\b", re.IGNORECASE), "Gem of Destiny"),
    (re.compile(r"\bCódigo\b", re.IGNORECASE), "Code"),
    (re.compile(r"\bVoucher\b", re.IGNORECASE), "Voucher"),
    (re.compile(r"\bCart[aã]o\s+Presente\b", re.IGNORECASE), "Gift Card"),
    (re.compile(r"\bCristais\b", re.IGNORECASE), "Crystals"),
    (re.compile(r"\bCristal\b", re.IGNORECASE), "Crystal"),
    # —— Mobile Legends: weekly / monthly packs (full titles as scraped) ——
    (
        re.compile(
            r"Pacote\s+Mensal\s+[ÉE]pico\s*\(\s*Dispon[ií]vel\s+Uma\s+Vez\s+Por\s+M[eê]s\s*\)",
            re.IGNORECASE,
        ),
        "Epic Monthly Pack (available once per month)",
    ),
    (
        re.compile(
            r"Pacote\s+Semanal\s+Elite\s*\(\s*Dispon[ií]vel\s+Uma\s+Vez\s+Por\s+Semana\s*\)",
            re.IGNORECASE,
        ),
        "Elite Weekly Pack (available once per week)",
    ),
    # Same without parentheses block (partial scrape)
    (
        re.compile(
            r"Pacote\s+Mensal\s+[ÉE]pico\s*\([^)]*Por\s+M[eê]s[^)]*\)",
            re.IGNORECASE,
        ),
        "Epic Monthly Pack (available once per month)",
    ),
    (
        re.compile(
            r"Pacote\s+Semanal\s+Elite\s*\([^)]*Por\s+Semana[^)]*\)",
            re.IGNORECASE,
        ),
        "Elite Weekly Pack (available once per week)",
    ),
    (re.compile(r"Passagem\s+do\s+crep[úu]sculo", re.IGNORECASE), "Twilight Pass"),
    (re.compile(r"Passe\s+Semanal\s+de\s+Diamante", re.IGNORECASE), "Weekly Diamond Pass"),
    # smile.one /br scrapes sometimes emit English "Pack" + Portuguese parenthetical
    (
        re.compile(
            r"Pack\s+Weekly\s+Elite\s*\(\s*Dispon[ií]vel\s+Uma\s+Vez\s+Por\s+Semana\s*\)",
            re.IGNORECASE,
        ),
        "Elite Weekly Pack (available once per week)",
    ),
    (
        re.compile(
            r"Pack\s+Monthly\s+Epic\s*\(\s*Dispon[ií]vel\s+Uma\s+Vez\s+Por\s+M[eê]s\s*\)",
            re.IGNORECASE,
        ),
        "Epic Monthly Pack (available once per month)",
    ),
    (
        re.compile(
            r"Pack\s+Weekly\s+Elite\s*\([^)]*Por\s+Semana[^)]*\)",
            re.IGNORECASE,
        ),
        "Elite Weekly Pack (available once per week)",
    ),
    (
        re.compile(
            r"Pack\s+Monthly\s+Epic\s*\([^)]*Por\s+M[eê]s[^)]*\)",
            re.IGNORECASE,
        ),
        "Epic Monthly Pack (available once per month)",
    ),
    (
        re.compile(r"Pack\s+de\s+Valor\s+por\s+Tempo\s+Limitado", re.IGNORECASE),
        "Limited Time Value Pack",
    ),
    (re.compile(r"\bRecarregar\b", re.IGNORECASE), "Top-up"),
    (
        re.compile(r"\(\s*Dispon[ií]vel\s+Uma\s+Vez\s+Por\s+Semana\s*\)", re.IGNORECASE),
        " (available once per week)",
    ),
    (
        re.compile(r"\(\s*Dispon[ií]vel\s+Uma\s+Vez\s+Por\s+M[eê]s\s*\)", re.IGNORECASE),
        " (available once per month)",
    ),
    (re.compile(r"\bPack\s+Weekly\s+Elite\b", re.IGNORECASE), "Elite Weekly Pack"),
    (re.compile(r"\bPack\s+Monthly\s+Epic\b", re.IGNORECASE), "Epic Monthly Pack"),
    # —— Free Fire: "100 diamantes+ 10% Bônus" ——
    (
        re.compile(
            r"(\d+)\s*diamantes?\s*\+\s*10%\s*B[ôo]nus",
            re.IGNORECASE,
        ),
        r"\1 Diamonds + 10% Bonus",
    ),
    (re.compile(r"\bB[ôo]nus\b", re.IGNORECASE), "Bonus"),
    # —— Wartune Ultra / other: "Moedas Estelares" (Star Coins) ——
    (re.compile(r"\bMoedas\s+Estelares\b", re.IGNORECASE), "Star Coins"),
    (re.compile(r"\bMoeda\s+Estelar\b", re.IGNORECASE), "Star Coin"),
    # —— Game-specific PT phrases ——
    (re.compile(r"\bRecompensa\s+de\s+Batalha\s+do\s+(\w+)", re.IGNORECASE), r"\1 Battle Reward"),
    (re.compile(r"\bRecompensa\s+de\s+Batalha\b", re.IGNORECASE), "Battle Reward"),
    (re.compile(r"\bBatalhe\s+por\b", re.IGNORECASE), "Battle for"),
    # —— Smile / payment UI ——
    (re.compile(r"\bMoeda\s+Smile\b", re.IGNORECASE), "Smile Coin"),
    (re.compile(r"\bMoedas?\s+Smile\b", re.IGNORECASE), "Smile Coins"),
    (re.compile(r"\bCr[eé]ditos?\s+Smile\b", re.IGNORECASE), "Smile Credits"),
    (re.compile(r"\bRecarga\b", re.IGNORECASE), "Top-up"),
    (re.compile(r"\bComprar\s+agora\b", re.IGNORECASE), "Buy now"),
    # "Pacote de N Ouros" → "Pack of N Gold" (before generic Pacote → Pack)
    (re.compile(r"\bPacote\s+Mensal\s+[ÉE]pico\b", re.IGNORECASE), "Epic Monthly Pack"),
    (re.compile(r"\bPacote\s+Semanal\s+Elite\b", re.IGNORECASE), "Elite Weekly Pack"),
    (re.compile(r"\bPacote\s+de\b", re.IGNORECASE), "Pack of"),
    (re.compile(r"\bPacote\b", re.IGNORECASE), "Pack"),
    (re.compile(r"\bOferta\b", re.IGNORECASE), "Offer"),
    (re.compile(r"\bDesconto\b", re.IGNORECASE), "Discount"),
    (re.compile(r"\bMoeda\b", re.IGNORECASE), "Coin"),
    (re.compile(r"\bMoedas\b", re.IGNORECASE), "Coins"),
    (re.compile(r"\bSem\s+cupom\b", re.IGNORECASE), "No coupon"),
    (re.compile(r"\bCupom\b", re.IGNORECASE), "Coupon"),
    # —— Remaining common words (after full phrases) ——
    (re.compile(r"\bDispon[ií]vel\b", re.IGNORECASE), "Available"),
    (re.compile(r"\bUma\s+Vez\b", re.IGNORECASE), "once"),
    (re.compile(r"\bPor\s+Semana\b", re.IGNORECASE), "per week"),
    (re.compile(r"\bPor\s+M[eê]s\b", re.IGNORECASE), "per month"),
    (re.compile(r"\bSemanal\b", re.IGNORECASE), "Weekly"),
    (re.compile(r"\bMensal\b", re.IGNORECASE), "Monthly"),
    (re.compile(r"\b[ÉE]pico\b", re.IGNORECASE), "Epic"),
    (re.compile(r"\bElite\b", re.IGNORECASE), "Elite"),
    (re.compile(r"\bPassagem\b", re.IGNORECASE), "Pass"),
    (re.compile(r"\bPasse\b", re.IGNORECASE), "Pass"),
    # —— Individual Portuguese words ——
    (re.compile(r"\bOuros\b", re.IGNORECASE), "Gold"),
    (re.compile(r"\bOuro\b", re.IGNORECASE), "Gold"),
    (re.compile(r"\bRecompensa\b", re.IGNORECASE), "Reward"),
    (re.compile(r"\bBatalha\b", re.IGNORECASE), "Battle"),
    (re.compile(r"\bBatalhe\b", re.IGNORECASE), "Battle"),
    (re.compile(r"\bEscolha\b", re.IGNORECASE), "Choice"),
    (re.compile(r"\bEscolher\b", re.IGNORECASE), "Choose"),
    (re.compile(r"\bEstelares\b", re.IGNORECASE), "Star"),
    (re.compile(r"\bEstelar\b", re.IGNORECASE), "Star"),
    (re.compile(r"\bDiamantes\b", re.IGNORECASE), "Diamonds"),
    (re.compile(r"\bDiamante\b", re.IGNORECASE), "Diamond"),
    (re.compile(r"\bPremium\b", re.IGNORECASE), "Premium"),
    (re.compile(r"\bEspecial\b", re.IGNORECASE), "Special"),
    (re.compile(r"\bGrande\b", re.IGNORECASE), "Grand"),
    (re.compile(r"\bPequeno\b", re.IGNORECASE), "Small"),
    (re.compile(r"\bSuper\b", re.IGNORECASE), "Super"),
    (re.compile(r"\bLimitado\b", re.IGNORECASE), "Limited"),
    (re.compile(r"\bExclusivo\b", re.IGNORECASE), "Exclusive"),
    (re.compile(r"\bRaro\b", re.IGNORECASE), "Rare"),
    (re.compile(r"\bGemas\b", re.IGNORECASE), "Gems"),
    (re.compile(r"\bGema\b", re.IGNORECASE), "Gem"),
    (re.compile(r"\bPrata\b", re.IGNORECASE), "Silver"),
    (re.compile(r"\bBronze\b", re.IGNORECASE), "Bronze"),
    (re.compile(r"\bAbra\b", re.IGNORECASE), "Open"),
    (re.compile(r"\bN[ií]vel\b", re.IGNORECASE), "Level"),
    # —— Verbs ——
    (re.compile(r"\bComprar\b", re.IGNORECASE), "Buy"),
    (re.compile(r"\bSelecione\b", re.IGNORECASE), "Select"),
    (re.compile(r"\bAdicionar\b", re.IGNORECASE), "Add"),
    (re.compile(r"\bGanhe\b", re.IGNORECASE), "Earn"),
    (re.compile(r"\bReceber\b", re.IGNORECASE), "Receive"),
    (re.compile(r"\bAtivar\b", re.IGNORECASE), "Activate"),
    (re.compile(r"\bDesbloquear\b", re.IGNORECASE), "Unlock"),
    (re.compile(r"\bDesbloquei[eo]\b", re.IGNORECASE), "Unlock"),
    # —— Prepositions — keep last so full phrases get matched first ——
    (re.compile(r"\bde\b"), "of"),
    (re.compile(r"\bdo\b"), "of"),
    (re.compile(r"\bda\b"), "of"),
    (re.compile(r"\bdos\b"), "of"),
    (re.compile(r"\bdas\b"), "of"),
    (re.compile(r"\bpor\b"), "for"),
    (re.compile(r"\bpara\b"), "for"),
    (re.compile(r"\bcom\b"), "with"),
    (re.compile(r"\bou\b"), "or"),
]


def is_spurious_package_label(name: str) -> bool:
    """
    True if this label is likely UI chrome / promo text, not a purchasable package title.
    """
    n = (name or "").strip()
    if len(n) < 2:
        return True
    # Pure digits/punctuation noise, or "176 (=) 186" coin-comparison fragments.
    if re.match(r"^[\s(=).,0-9]+$", n):
        return True
    if re.search(r"\d[0-9.,]*\s*\(\s*=\s*\)\s*\d", n, re.IGNORECASE):
        return True
    # Standalone (=) fragments from broken scrapes
    if n in ("(=)", "=", "( = )"):
        return True
    if re.fullmatch(r"\(?\s*=\s*\)?", n):
        return True
    nl = n.lower()
    # smile.one step / payment help lines scraped as "packages"
    help_line_patterns = (
        r"coloque\s+seu\s+id",
        r"m[eé]todo\s+de\s+pagamento",
        r"pagamento\s+smile",
    )
    for pattern in help_line_patterns:
        if re.search(pattern, nl, re.IGNORECASE):
            return True
    # Numbered checkout steps only (e.g. "1 COLOQUE…", "3 MÉTODO…"). Do NOT use a broad
    # "racing master" / "pagamento" tail — real SKUs often include the game name (Racing Master).
    if re.match(r"^\d{1,2}\s+(coloque|m[eé]todo)\b", n, re.IGNORECASE):
        return True
    # smile.one /br checkout: payment method tiles (not game SKUs)
    if re.match(
        r"^(picpay|pix|boleto|lot[eé]rica|loterica|mercado\s+pago|google\s+pay|apple\s+pay|paypal|cart[aã]o)\s*$",
        n,
        re.IGNORECASE,
    ):
        return True
    # Merged payment / Smile Coin promo lines (e.g. "Smile Coin 176 (=) 186 Pix PicPay …")
    if re.search(r"(?i)smile\s+coin", n) and re.search(
        r"(?i)(\(\s*=\s*\)|\bpix\b|\bpicpay\b|lot[eé]rica)",
        n,
    ):
        return True
    if len(n) > 25 and re.search(r"(?i)\bpix\b", n) and re.search(r"(?i)\bpicpay\b", n):
        return True
    if not re.search(r"(?i)diamond|diamante|pass|passe|pack|pacote|twilight|elite|epic", n):
        if len(re.findall(r"(?i)\b(picpay|pix|lot[eé]rica|boleto)\b", n)) >= 2:
            return True
    # Long promo / instruction / description lines scraped as SKU names
    if len(n) > 80 and re.search(
        r"(?i)(compra\s+(est[áa]|s[óo])|disponível\s+apenas|jogador\s+pode|personagem\s+alcan[çc]|renovado\s+a\s+cada)",
        n,
    ):
        return True
    # Leftover checkout-step text ("SELECIONE …"), numbered or not.
    return bool(
        re.search(r"(?i)^\d+\s+SELECIONE\b", n) or re.search(r"(?i)^SELECIONE\b", n)
    )


def _is_mlbb_product_url(product_url: str | None) -> bool:
    u = (product_url or "").lower()
    return "mobilelegends" in u or "mobile-legends" in u or "/mlbb" in u or "m-lbb" in u


def _score_display_name(nm: str) -> int:
    """Higher = better label when deduping same-price rows."""
    if not nm or not nm.strip():
        return 0
    label = nm.strip()
    # Generic placeholders score lowest; canonical DiamondxN SKUs score highest.
    if re.match(r"^Package\s+\d+$", label, re.IGNORECASE):
        return 2
    if re.match(r"^R\$\s*[\d\.,]+\s*$", label, re.IGNORECASE):
        return 3
    if re.search(r"Diamondx\d+", label, re.IGNORECASE):
        return 8
    currency_words = (
        r"\b(gems?|diamonds?|diamantes?|crystals?|credits?|bonds?|vouchers?|"
        r"rubies?|lunacy|ingots?|shards?|essences?|cps|uc|tokens)\b"
    )
    if re.search(currency_words, label, re.IGNORECASE):
        return 7
    if re.match(r"^R\$\s*[\d\.,]+\s+\S", label, re.IGNORECASE):
        return 6
    return 5


def dedupe_packages_by_brl_cents(packages: list[dict]) -> list[dict]:
    """
    Same BRL often appears twice (e.g. generic 'Package N' + real title). Remove only
    the generic duplicate when a better label exists. **Do not** collapse two different
    SKUs that share the same price (e.g. different diamond amounts).

    Preserve first-seen price order (like dedupe_packages_by_php_cents) so UI + package_index
    stay aligned with smile.one DOM order — sorting only by brl_cents broke Brazil auto-order.

    Args:
        packages: scraped package dicts; price read from ``brl_cents``. Rows with a
            missing, zero, or unparsable price are dropped.

    Returns:
        Packages grouped by price in first-seen order, with case-insensitive
        duplicate names removed and generic 'Package N' rows dropped whenever the
        same price also has a non-generic label.
    """
    # Single pass: group per price and record first-seen price order
    # (previously two identical loops over ``packages``).
    by_cents: dict[int, list[dict]] = {}
    cent_order: list[int] = []
    for p in packages:
        try:
            c = int(p.get("brl_cents") or 0)
        except (TypeError, ValueError):
            continue
        if c <= 0:
            continue
        if c not in by_cents:
            by_cents[c] = []
            cent_order.append(c)
        by_cents[c].append(p)

    out: list[dict] = []
    for c in cent_order:
        group = by_cents[c]

        # Drop exact duplicate names (case-insensitive), keeping first occurrence.
        seen_nm: set[str] = set()
        uniq: list[dict] = []
        for p in group:
            nm = (p.get("package_name") or p.get("package_name_en") or "").strip().lower()
            if nm in seen_nm:
                continue
            seen_nm.add(nm)
            uniq.append(p)

        # Prefer real titles over generic 'Package N' placeholders at the same price.
        non_pkg = [
            p
            for p in uniq
            if not re.match(r"^Package\s+\d+$", (p.get("package_name") or "").strip(), re.IGNORECASE)
        ]
        if non_pkg:
            out.extend(non_pkg)
        elif uniq:
            # All generic: keep the single best-scoring label.
            out.append(max(uniq, key=lambda x: _score_display_name((x.get("package_name") or "").strip())))

    return out


def dedupe_packages_by_php_cents(packages: list[dict]) -> list[dict]:
    """Same dedupe semantics as BRL, but keyed on ``php_cents`` and preserving
    scrape/DOM order (never sorted by price).

    Args:
        packages: scraped package dicts; rows with a missing, zero, or unparsable
            ``php_cents`` are dropped.

    Returns:
        Packages grouped by price in first-seen order, with case-insensitive
        duplicate names removed and generic 'Package N' rows dropped whenever the
        same price also has a non-generic label.
    """
    # Single pass: group per price and record first-seen price order
    # (previously two identical loops over ``packages``).
    by_cents: dict[int, list[dict]] = {}
    cent_order: list[int] = []
    for p in packages:
        try:
            c = int(p.get("php_cents") or 0)
        except (TypeError, ValueError):
            continue
        if c <= 0:
            continue
        if c not in by_cents:
            by_cents[c] = []
            cent_order.append(c)
        by_cents[c].append(p)

    out: list[dict] = []
    for c in cent_order:
        group = by_cents[c]

        # Drop exact duplicate names (case-insensitive), keeping first occurrence.
        seen_nm: set[str] = set()
        uniq: list[dict] = []
        for p in group:
            nm = (p.get("package_name") or p.get("package_name_en") or "").strip().lower()
            if nm in seen_nm:
                continue
            seen_nm.add(nm)
            uniq.append(p)

        # Prefer real titles over generic 'Package N' placeholders at the same price.
        non_pkg = [
            p
            for p in uniq
            if not re.match(r"^Package\s+\d+$", (p.get("package_name") or "").strip(), re.IGNORECASE)
        ]
        if non_pkg:
            out.extend(non_pkg)
        elif uniq:
            # All generic: keep the single best-scoring label.
            out.append(max(uniq, key=lambda x: _score_display_name((x.get("package_name") or "").strip())))

    return out


def _normalize_page_title_for_prefix(title: str) -> str:
    """Browser tab / merchant title often includes site name after | or dash."""
    s = (title or "").strip()
    if not s:
        return ""
    s = re.sub(r"\s+", " ", s)
    s = re.split(r"\s*[|]\s*", s, 1)[0].strip()
    s = re.split(r"\s+[-–—]\s+", s, 1)[0].strip()
    return s


def strip_shared_page_title_prefix(
    names: list[str],
    page_title: str | None,
    *,
    product_url: str | None = None,
    min_title_len: int = 10,
    max_title_len: int = 120,
) -> list[str]:
    """
    Remove a page-title prefix that the scraper glued onto every card label.

    Example: "Racing Master 500 Gold", "Racing Master 1000 Gold", … become
    "500 Gold", "1000 Gold". The strip is all-or-nothing: it only happens when
    every non-empty name starts with the normalized page title
    (case-insensitive), and it never runs for MLBB product URLs.
    """
    if _is_mlbb_product_url(product_url):
        return names

    title = _normalize_page_title_for_prefix(page_title or "")
    if not (min_title_len <= len(title) <= max_title_len):
        return names

    stripped = [(n or "").strip() for n in names]
    if not stripped:
        return names

    prefix_re = re.compile(r"^\s*" + re.escape(title) + r"\s*", re.IGNORECASE)
    if any(not s or prefix_re.match(s) is None for s in stripped):
        # At least one card does not carry the prefix — leave everything alone.
        return names

    result: list[str] = []
    for s in stripped:
        tail = prefix_re.sub("", s, count=1).lstrip(" \t:-–—|").strip()
        if not tail:
            # Stripping would wipe a label out entirely; bail on the whole batch.
            return names
        result.append(tail)
    return result


# Page titles that carry no product information (scraper fallbacks / UI chrome);
# consumed by is_trash_page_title(). Portuguese entries: "sem cupom" = "no coupon",
# "recarregar" = "recharge", "comprar agora" = "buy now".
_TRASH_PAGE_TITLES = frozenset(
    {
        "no coupon",
        "sem cupom",
        "package",
        "recarregar",
        "comprar agora",
    }
)


def humanize_smile_product_url(url: str | None) -> str:
    """Last meaningful path segment as Title Case (e.g. wherewindsmeet -> Wherewindsmeet)."""
    if not url:
        return ""
    try:
        parts = (url or "").strip().split("?")[0].rstrip("/").split("/")
        for p in reversed(parts):
            pl = p.lower()
            if p and pl not in (
                "merchant",
                "game",
                "pay",
                "entertainment",
                "br",
                "my",
                "th",
                "in",
                "www.smile.one",
            ):
                return p.replace("-", " ").replace("_", " ").strip().title()
    except Exception:
        pass
    return ""


def is_trash_page_title(t: str | None) -> bool:
    x = (t or "").strip().lower()
    if len(x) < 3:
        return True
    if x in _TRASH_PAGE_TITLES:
        return True
    if re.match(r"^package\s*\d*\s*$", x):
        return True
    return False


def effective_page_title(raw: str | None, product_url: str | None) -> str:
    """Use h1 title when sane; otherwise derive a label from the merchant URL."""
    title = (raw or "").strip()
    if is_trash_page_title(title):
        # Garbage title: prefer a URL-derived label, keep the original as last resort.
        return humanize_smile_product_url(product_url) or title
    return title


def _strip_smile_site_branding(s: str) -> str:
    """Remove/replace third-party storefront name in scraped titles (customer-facing UI)."""
    if not s:
        return s
    try:
        from config import STORE_DISPLAY_NAME

        brand = (STORE_DISPLAY_NAME or "JungTzy Store").strip()
    except Exception:
        brand = "JungTzy Store"
    val = str(s)
    val = re.sub(r"\s*\|\s*Smile\.One\b.*$", "", val, flags=re.IGNORECASE)
    val = re.sub(r"\s*-\s*Smile\.One\b.*$", "", val, flags=re.IGNORECASE)
    val = re.sub(r"\s*\|\s*smile\.one\b.*$", "", val, flags=re.IGNORECASE)
    val = re.sub(r"\s*-\s*smile\.one\b.*$", "", val, flags=re.IGNORECASE)
    val = re.sub(r"\bSmile\.One\b", brand, val, flags=re.IGNORECASE)
    val = re.sub(r"\bsmile\.one\b", brand, val, flags=re.IGNORECASE)
    val = re.sub(r"\s+", " ", val).strip()
    val = re.sub(r"^\s*\|\s*|\s*\|\s*$", "", val).strip()
    return val


def package_name_to_english(
    name: str,
    *,
    product_url: str | None = None,
    page_title: str | None = None,
) -> str:
    """
    Best-effort English labels for UI.
    MLBB (mobilelegends): Diamond merge + DiamondxN display.
    Other games: strip duplicate R$ in the title; do not force Diamondx (fixes Where Winds Meet / gems).

    page_title: optional h1 / tab title — used to replace generic 'Package N' when the scrape had no card label.
    product_url: merchant URL — selects MLBB-specific rules and URL-derived fallback labels.
    Returns the cleaned display label; empty/blank input is returned unchanged.
    """
    s = (name or "").strip()
    if not s:
        return s

    # Generic "Package N" placeholder: rebuild it as "<page or URL label> · N".
    eff = effective_page_title(page_title, product_url)
    m_pkg = re.match(r"(?i)^Package\s+(\d+)\s*$", s)
    if m_pkg and eff:
        base = eff if len(eff) <= 52 else eff[:52].strip()  # keep UI labels short
        s = f"{base} · {int(m_pkg.group(1))}"
    elif m_pkg and not eff:
        # No usable page title: fall back to a label derived from the merchant URL.
        hb = humanize_smile_product_url(product_url)
        if hb:
            base = hb if len(hb) <= 52 else hb[:52].strip()
            s = f"{base} · {int(m_pkg.group(1))}"

    # Drop promo/bonus noise, then apply the PT->EN phrase table (order matters:
    # later stages expect English fragments).
    s = strip_bonus_extra_noise(s)
    for pat, repl in _PHRASES:
        s = pat.sub(repl, s)
    s = re.sub(r"\s+", " ", s).strip()

    # Remove price tokens duplicated from the card (shown separately as MMK)
    s = re.sub(r"^\s*R\$\s*[\d\.,]+\s*", "", s, flags=re.IGNORECASE)
    s = re.sub(r"\s*R\$\s*[\d\.,]+\s*", " ", s, flags=re.IGNORECASE)
    s = re.sub(r"\s+", " ", s).strip()

    # After PT→EN phrases: split merged multi-tier checkout lines (SELECIONE / long lists)
    s = collapse_merged_smile_tier_label(s)
    s = collapse_merged_uc_tier_label(s)
    s = collapse_merged_tokens_tier_label(s)
    s = collapse_merged_pt_pack_tiers(s)
    s = collapse_merged_gem_diamond_tier_label(s, product_url=product_url)

    # MLBB only: translate "Diamantes" and apply the Diamond-label helpers
    # (other games must NOT be forced into the Diamond×N form — see docstring).
    if _is_mlbb_product_url(product_url):
        s = re.sub(r"(?i)\bdiamantes?\b", "Diamonds", s)
        s = sanitize_diamond_merged_labels(s)
        s = diamond_base_only_display(s)
    # NOTE(review): "000 Diamonds" is presumably a scrape truncation of
    # "10,000 Diamonds" — confirm against the upstream scraper.
    if re.match(r"^000\s+Diamonds?\s*$", (s or "").strip(), re.IGNORECASE):
        s = "10,000 Diamonds"
    return _strip_smile_site_branding(_strip_smile_promo_suffixes(s))


def normalize_package_cache_entry(merchant_url: str, entry: dict) -> dict:
    """
    Re-apply package_name_to_english to cached scrape data (names + title).
    Call after changing display rules so disk cache matches the API.
    """
    result = dict(entry or {})
    payload = dict(result.get("data") or {})

    original_title = (payload.get("title") or "").strip()
    fixed_title = effective_page_title(original_title, merchant_url)
    if is_trash_page_title(original_title) and fixed_title:
        payload["title"] = fixed_title

    # Rewrite every cached package name through the current display rules.
    payload["packages"] = [
        {
            **pkg,
            "name": package_name_to_english(
                (pkg.get("name") or ""),
                product_url=merchant_url,
                page_title=original_title,
            ),
        }
        for pkg in payload.get("packages") or []
    ]
    result["data"] = payload
    return result


def rewrite_package_cache_file(path: str | Path) -> int:
    """Rewrite package_cache.json in place; returns number of merchant keys updated."""
    fp = Path(path)
    if not fp.is_file():
        return 0
    raw = json.loads(fp.read_text(encoding="utf-8"))
    if not isinstance(raw, dict):
        return 0
    out = {}
    for url, entry in raw.items():
        out[url] = normalize_package_cache_entry(url, entry if isinstance(entry, dict) else {})
    fp.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")
    return len(out)


def sanitize_merchant_display_name(name: str) -> str:
    """
    Clean home-page card titles: strip embedded BRL prices and apply English hints.
    Example: "Mobile Legends: Bang Bang-R$ 15,00" -> label without price text.
    """
    label = (name or "").strip()
    if not label:
        return label
    # Drop "R$ 12,34" price fragments, with or without a leading dash.
    label = re.sub(r"\s*[-–—]?\s*R\$\s*[\d\.,]+\s*", " ", label, flags=re.IGNORECASE)
    label = re.sub(r"\s*R\$\s*[\d\.,]+\s*", " ", label, flags=re.IGNORECASE)
    label = re.sub(r"\s+", " ", label).strip(" -–—")
    return package_name_to_english(label)
