#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
#   "jinja2",
# ]
# ///
"""
Generate publications/index.html from publications/publications_svaiter.bib.

Run with:
    uv run publications/scripts/generate_publications.py

Classification rules (from BibTeX entry type + `keywords` field):
    @article                        -> J  (Journal)
    @inproceedings + workshop kw    -> W  (Workshop)
    @inproceedings + frenchconf kw  -> CF (French Conference, e.g. GRETSI)
    @inproceedings (other)          -> C  (International Conference)
    @techreport                     -> P  (Preprint)
    @phdthesis                      -> T  (Thesis; PhD or HDR inferred from key)
    @incollection                   -> B  (Book chapter)

Numbering: within each type, entries are sorted oldest-first so IDs are
stable (J1 is the oldest journal paper). Display order is newest year
first, grouped by type within each year.

Generated by Claude Opus 4.6 on Apr 11, 2026.
"""

from __future__ import annotations

import re
import sys
from pathlib import Path

from jinja2 import Environment, FileSystemLoader

# Path layout (per the module docstring, this script lives in
# publications/scripts/, so PUB_DIR is publications/ and ROOT is the site root).
SCRIPT_DIR = Path(__file__).resolve().parent
PUB_DIR = SCRIPT_DIR.parent  # publications/
ROOT = PUB_DIR.parent  # site root; used only for relative paths in CLI output
BIB_PATH = PUB_DIR / "publications_svaiter.bib"  # input BibTeX database
OUT_PATH = PUB_DIR / "index.html"  # generated HTML
MD_PATH = PUB_DIR / "publications.md"  # generated Markdown
TEMPLATE_DIR = SCRIPT_DIR  # Jinja2 template sits next to this script
TEMPLATE_NAME = "publications_template.html.j2"

# Per-type CSS class suffix for rendering — presumably consumed by the
# template (not visible here); J and C share the default style "".
TYPE_CLASS = {"J": "", "C": "", "CF": "cf", "W": "w", "B": "b", "T": "t", "P": "p"}
# Sort precedence of types within a year group (see group_by_year).
TYPE_ORDER = {"J": 0, "C": 1, "CF": 2, "W": 3, "B": 4, "T": 5, "P": 6}


# ---------- BibTeX parser (minimal, hand-rolled) ----------

def parse_bib(text: str) -> list[dict]:
    """Return a list of entries as dicts with keys plus `_type` and `_key`.

    Expands @string macros into bareword field values, skips @comment
    entries, and finds each entry's extent by tracking brace depth so
    nested braces inside field values do not end the entry early.
    """
    # Collect @string definitions: @string{name = "value"}
    strings: dict[str, str] = {}
    for m in re.finditer(r'@string\s*\{\s*(\w+)\s*=\s*"([^"]*)"\s*\}', text, re.IGNORECASE):
        strings[m.group(1)] = m.group(2)
    # Strip @string lines so they don't confuse the main scan.
    # NOTE(review): this assumes @string values never contain '}'; a braced
    # value would truncate the strip early — confirm against the .bib file.
    text = re.sub(r'@string\s*\{[^}]*\}', '', text, flags=re.IGNORECASE)

    entries: list[dict] = []
    i = 0  # absolute scan position in `text`
    while True:
        # Next entry header: "@type{key," — searched relative to position i.
        m = re.search(r'@(\w+)\s*\{\s*([^,\s]+)\s*,', text[i:])
        if not m:
            break
        etype = m.group(1).lower()
        if etype == "comment":
            # Skip @comment entries (m.end() is relative to text[i:]).
            i += m.end()
            continue
        key = m.group(2).strip()
        start = i + m.end()  # first character of the entry body
        # Walk forward to matching closing brace, tracking depth.
        depth = 1
        j = start
        in_str = False  # inside a "..." quoted value, braces are ignored
        while j < len(text) and depth > 0:
            c = text[j]
            if c == '"' and text[j - 1] != '\\':
                # Unescaped quote toggles string mode.
                # NOTE(review): this toggles even for a quote character inside
                # a {...} value, so an unpaired '"' there would desynchronize
                # the brace tracking — acceptable for this curated .bib.
                in_str = not in_str
            elif not in_str:
                if c == '{':
                    depth += 1
                elif c == '}':
                    depth -= 1
                    if depth == 0:
                        break
            j += 1
        body = text[start:j]  # entry body without the outer braces
        fields = _parse_fields(body, strings)
        fields["_type"] = etype
        fields["_key"] = key
        entries.append(fields)
        i = j + 1  # resume scanning just past the entry's closing brace
    return entries


def _parse_fields(body: str, strings: dict[str, str]) -> dict:
    """Parse `name = value` pairs from a BibTeX entry body.

    Values may be brace-delimited ({...}, nesting allowed), quote-delimited
    ("..."), or barewords; barewords are expanded through the @string macro
    table `strings` (unknown macros pass through verbatim).

    Returns a dict mapping lowercased field names to stripped string values.
    """
    fields: dict = {}
    i = 0
    n = len(body)
    while i < n:
        # Skip whitespace and commas between fields.
        while i < n and body[i] in " \t\n\r,":
            i += 1
        if i >= n:
            break
        m = re.match(r'(\w+)\s*=\s*', body[i:])
        if not m:
            break  # malformed remainder — stop rather than loop forever
        name = m.group(1).lower()
        i += m.end()
        if i >= n:
            break
        # Value: {...}, "...", or a bareword string macro
        if body[i] == '{':
            # Scan to the matching close brace, allowing nested braces.
            depth = 1
            j = i + 1
            while j < n and depth > 0:
                if body[j] == '{':
                    depth += 1
                elif body[j] == '}':
                    depth -= 1
                    if depth == 0:
                        break
                j += 1
            value = body[i + 1:j]
            i = j + 1
        elif body[i] == '"':
            # An unterminated quote takes the rest of the body instead of
            # raising (str.index would throw ValueError and abort the run).
            j = body.find('"', i + 1)
            if j == -1:
                j = n
            value = body[i + 1:j]
            i = j + 1
        else:
            m2 = re.match(r'(\w+)', body[i:])
            if not m2:
                i += 1  # unrecognized character — skip it and resync
                continue
            word = m2.group(1)
            value = strings.get(word, word)
            i += m2.end()
        fields[name] = value.strip()
    return fields


# ---------- LaTeX cleanup ----------

# Literal LaTeX -> HTML/Unicode replacements applied by clean_latex().
# These are plain `str.replace` targets, NOT regexes, so each raw string must
# contain exactly the characters found in the .bib source: one backslash per
# accent command (the previous r"{\\'e}" forms held TWO backslashes and never
# matched real BibTeX like {\'e}).  Braced forms ({\'e}) are listed before
# bare forms (\'e) so the enclosing braces are consumed as well, and "---"
# precedes "--" so em dashes are not mangled into en dashes.
_LATEX_PAIRS = [
    (r"{\'e}", "é"), (r"\'e", "é"),
    (r"{\'E}", "É"), (r"\'E", "É"),
    (r"{\'a}", "á"), (r"\'a", "á"),
    (r"{\`e}", "è"), (r"\`e", "è"),
    (r"{\`a}", "à"), (r"\`a", "à"),
    (r"{\^e}", "ê"), (r"\^e", "ê"),
    (r"{\^o}", "ô"), (r"\^o", "ô"),
    (r'{\"o}', "ö"), (r'\"o', "ö"),
    (r"\c{c}", "ç"),
    (r"\(\ell_{12}\)", "ℓ<sub>12</sub>"),
    (r"\ell_{12}", "ℓ<sub>12</sub>"),
    ("---", "—"),
    ("--", "–"),
]


def clean_latex(s: str) -> str:
    """Apply the literal LaTeX-to-Unicode/HTML substitutions and trim whitespace."""
    if not s:
        return ""
    cleaned = s
    for target, replacement in _LATEX_PAIRS:
        cleaned = cleaned.replace(target, replacement)
    return cleaned.strip()


# ---------- Authors ----------

def split_authors(field: str) -> list[str]:
    """Split a BibTeX author field on whitespace-delimited 'and' separators."""
    pieces = re.split(r'\s+and\s+', field)
    return [piece.strip() for piece in pieces if piece.strip()]


def format_author_name(raw: str) -> str:
    """Format one author name as abbreviated initials + last name.

    Accepts either "Last, First" or "First Last" order.  Hyphenated given
    names become hyphenated initials ("Jean-Pierre" -> "J.-P.").  A name
    with no given-name part (single word, or "Last," with nothing after the
    comma) is returned as just the last name, without a stray leading space.
    """
    raw = clean_latex(raw)
    if "," in raw:
        last, first = [p.strip() for p in raw.split(",", 1)]
    else:
        parts = raw.split()
        if len(parts) == 1:
            return parts[0]
        last = parts[-1]
        first = " ".join(parts[:-1])
    initials = []
    # str.split() never yields empty strings, so no emptiness guard needed.
    for part in first.split():
        if "-" in part:
            subs = [s for s in part.split("-") if s]
            initials.append(".-".join(s[0] for s in subs) + ".")
        else:
            initials.append(part[0] + ".")
    if not initials:
        # e.g. "Doe," — previously produced " Doe" with a leading space.
        return last
    return " ".join(initials) + " " + last


def is_me(raw: str) -> bool:
    """Return True when the cleaned author string contains both name parts."""
    name = clean_latex(raw)
    return all(token in name for token in ("Vaiter", "Samuel"))


def render_authors(raws: list[str]) -> str:
    """Join formatted author names into a period-terminated list.

    Uses an Oxford-style ", and" before the final author, even for two.
    """
    names = [format_author_name(raw) for raw in raws]
    if not names:
        return ""
    if len(names) == 1:
        return names[0] + "."
    head = ", ".join(names[:-1])
    return f"{head}, and {names[-1]}."


# ---------- Classification, venue, links ----------

def classify(entry: dict) -> str:
    """Map a BibTeX entry type (plus keywords) to a publication-type code.

    See the module docstring for the full J/C/CF/W/B/T/P classification.
    Unknown entry types yield "?".
    """
    etype = entry["_type"]
    if etype == "inproceedings":
        keywords = entry.get("keywords", "")
        if "workshop" in keywords:
            return "W"
        return "CF" if "frenchconf" in keywords else "C"
    simple = {
        "article": "J",
        "techreport": "P",
        "phdthesis": "T",
        "incollection": "B",
    }
    return simple.get(etype, "?")


def get_venue(entry: dict) -> str:
    """Build the human-readable venue string for an entry, by BibTeX type."""
    etype = entry["_type"]
    if etype == "article":
        return clean_latex(entry.get("journal", ""))
    if etype == "inproceedings":
        venue = clean_latex(entry.get("booktitle", ""))
        # French-conference entries get an explicit GRETSI tag unless the
        # booktitle already mentions it.
        needs_tag = "frenchconf" in entry.get("keywords", "") and "GRETSI" not in venue
        return (venue + " (GRETSI)") if needs_tag else venue
    if etype == "techreport":
        return "Preprint"
    if etype == "phdthesis":
        # HDR vs PhD is inferred from the citation key.
        if "hdr" in entry["_key"].lower():
            return "HDR Thesis"
        return "PhD Thesis"
    if etype == "incollection":
        book = clean_latex(entry.get("booktitle", ""))
        if book:
            return f"Book chapter in <em>{book}</em>"
        return "Book chapter"
    return ""


def get_links(entry: dict, pubdir: Path) -> list[tuple[str, str]]:
    """Collect (label, href) pairs for an entry.

    Order: DOI, local PDF (pdfs/ subdir preferred over flat), arXiv, HAL,
    then a companion site directory named after the citation key.
    """
    links: list[tuple[str, str]] = []
    doi = entry.get("doi")
    if doi is not None:
        links.append(("DOI", "https://doi.org/" + doi.strip().replace(" ", "%20")))
    pdf_name = f"{entry['_key']}.pdf"
    nested_pdf = pubdir / "pdfs" / pdf_name
    flat_pdf = pubdir / pdf_name
    if nested_pdf.exists():
        links.append(("PDF", f"pdfs/{pdf_name}"))
    elif flat_pdf.exists():
        links.append(("PDF", pdf_name))
    eprint = entry.get("eprint")
    if eprint is not None:
        arxiv_id = eprint.replace("arXiv:", "").strip()
        links.append(("arXiv", f"https://arxiv.org/abs/{arxiv_id}"))
    hal = entry.get("hal")
    if hal is not None:
        links.append(("HAL", "https://hal.science/" + hal.strip()))
    site_dir = pubdir / entry["_key"]
    if site_dir.is_dir() and (site_dir / "index.html").exists():
        links.append(("SITE", f"{entry['_key']}/"))
    return links


# ---------- Pipeline ----------

def process(entries: list[dict], pubdir: Path) -> list[dict]:
    """Convert raw parsed BibTeX entries into render-ready publication dicts.

    Each result carries the citation key, a numeric year (0 when the field
    is missing or unparsable), cleaned title, rendered author strings,
    venue, note, links, and the publication type / CSS class.
    """
    out = []
    for e in entries:
        title = clean_latex(e.get("title", "")).rstrip(".").strip()
        t = classify(e)
        # Render the author list once; the HTML and text variants are
        # currently identical but kept as separate keys for the two outputs.
        authors = render_authors(split_authors(e.get("author", "")))
        out.append({
            "key": e["_key"],
            "year": _parse_year(e.get("year", "")),
            "title": title,
            "authors_html": authors,
            "authors_text": authors,
            "venue": get_venue(e),
            "note": clean_latex(e.get("note", "")),
            "links": get_links(e, pubdir),
            "type": t,
            "type_class": TYPE_CLASS.get(t, ""),
        })
    return out


def _parse_year(raw: str) -> int:
    """Parse a BibTeX year field into an int.

    Tolerates values like "2020, to appear" by falling back to the first
    4-digit run; returns 0 when nothing parses (the previous bare int()
    call crashed the whole script on any non-numeric year).
    """
    raw = (raw or "").strip()
    try:
        return int(raw)
    except ValueError:
        m = re.search(r"\d{4}", raw)
        return int(m.group(0)) if m else 0


def number_by_type(pubs: list[dict]) -> None:
    """Assign stable per-type IDs (J1, C3, ...) in place.

    Within each type, entries are numbered oldest-first (year, then key)
    so an entry's ID never changes when newer publications are added.
    """
    buckets: dict[str, list[dict]] = {}
    for pub in pubs:
        buckets.setdefault(pub["type"], []).append(pub)
    for code, bucket in buckets.items():
        ordered = sorted(bucket, key=lambda p: (p["year"], p["key"]))
        for idx, pub in enumerate(ordered, start=1):
            pub["number"] = idx
            pub["id"] = f"{code}{idx}"


def render_markdown(years: list[dict]) -> str:
    """Render a plain markdown list, one line per publication, grouped by year."""
    tag_re = re.compile(r"<[^>]+>")  # strip HTML leaked in by clean_latex
    out: list[str] = ["# Publications", ""]
    for group in years:
        out.append(f"## {group['year']}")
        out.append("")
        for pub in group["entries"]:
            plain_title = tag_re.sub("", pub["title"])
            plain_venue = tag_re.sub("", pub["venue"])
            entry_line = f"- {pub['authors_text']} **{plain_title}.** *{plain_venue}.*"
            link_part = " ".join(f"[{label}]({href})" for label, href in pub["links"])
            if link_part:
                entry_line = f"{entry_line} {link_part}"
            out.append(entry_line)
        out.append("")
    return "\n".join(out).rstrip() + "\n"


def group_by_year(pubs: list[dict]) -> list[dict]:
    """Group publications by year for display, newest year first.

    Within a year, entries are ordered by type (TYPE_ORDER precedence) and
    then by descending per-type number, i.e. newest of each type first.
    Requires number_by_type() to have run (reads the "number" key).
    """
    buckets: dict[int, list[dict]] = {}
    for pub in pubs:
        buckets.setdefault(pub["year"], []).append(pub)

    def display_key(p: dict) -> tuple[int, int]:
        return (TYPE_ORDER.get(p["type"], 99), -p["number"])

    return [
        {"year": year, "entries": sorted(buckets[year], key=display_key)}
        for year in sorted(buckets, reverse=True)
    ]


def main() -> int:
    """Regenerate the HTML and Markdown publication lists from the .bib file.

    Returns a process exit code: 0 on success, 1 when the .bib is missing.
    """
    if not BIB_PATH.exists():
        print(f"ERROR: {BIB_PATH} not found", file=sys.stderr)
        return 1

    raw = BIB_PATH.read_text(encoding="utf-8")
    pubs = process(parse_bib(raw), BIB_PATH.parent)
    number_by_type(pubs)
    years = group_by_year(pubs)

    env = Environment(
        loader=FileSystemLoader(str(TEMPLATE_DIR)),
        autoescape=False,  # entries deliberately carry HTML (<em>, <sub>)
        trim_blocks=False,
        lstrip_blocks=False,
    )
    html = env.get_template(TEMPLATE_NAME).render(years=years, total=len(pubs))
    OUT_PATH.write_text(html, encoding="utf-8")
    MD_PATH.write_text(render_markdown(years), encoding="utf-8")

    # Per-type counts for the CLI summary line.
    counts: dict[str, int] = {}
    for p in pubs:
        counts[p["type"]] = counts.get(p["type"], 0) + 1
    summary = ", ".join(f"{t}={c}" for t, c in sorted(counts.items()))
    print(f"Wrote {OUT_PATH.relative_to(ROOT)} — {len(pubs)} entries ({summary})")
    print(f"Wrote {MD_PATH.relative_to(ROOT)}")
    return 0


if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    sys.exit(main())
