"""Streamlit app to auto-fill missing/empty <img> alt attributes in a WordPress XML export

Features
--------
1. Upload a WordPress XML (WXR) file.
2. Detect every <img> tag lacking a non-empty alt attribute.
3. Download each image (with optional HTTP Basic Auth) and send it to the OpenAI
   Responses API (vision) to obtain an accessibility-friendly alt text.
4. Inject the generated alt text back into the <img> tag, preserving the rest of
   the XML verbatim.
5. Display real-time progress and summary statistics.
6. Provide a download button for the updated XML.

Environment variables (via .env or shell):
    OPENAI_API_KEY      – required for OpenAI calls.
    BASIC_AUTH_USER     – staging basic-auth user (optional).
    BASIC_AUTH_PWD      – staging basic-auth password (optional).
"""
from __future__ import annotations

import base64
import html
import io
import os
import re
import unicodedata
from datetime import datetime
from typing import Dict, Tuple, Optional
from urllib.parse import unquote

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from openai import OpenAI
from requests.auth import HTTPBasicAuth

# ---------------------------------------------------------------------------
# Initialisation
# ---------------------------------------------------------------------------
# Read a local .env file (if present) first, so the variables listed in the
# module docstring are available to everything constructed afterwards.
load_dotenv()
# Single module-level client reused by generate_alt_text() for every request.
# It is built with no arguments, so credentials have to come from the
# environment. NOTE(review): presumably this raises at start-up when
# OPENAI_API_KEY is missing - confirm against the installed openai version.
openai_client = OpenAI()  # api_key picked up automatically from env / .env

# ---------------------------------------------------------------------------
# Utility helpers
# ---------------------------------------------------------------------------
# Matches a complete <img ...> tag. The lookahead requires whitespace, "/" or
# ">" right after the tag name so that longer names (e.g. <imgfoo>) are ignored.
IMG_TAG_REGEX = re.compile(r"<img(?=[\s/>])[^>]*>", re.IGNORECASE)
# Group 1 is the quoted value of the real ``src`` attribute. The lookbehind
# rejects attributes that merely end in "src" (data-src, xlink:src, ...), which
# lazy-loading plugins commonly place before the real one. Whitespace around
# "=" is tolerated, as HTML allows it.
SRC_REGEX = re.compile(r"(?<![\w:-])src\s*=\s*[\"']([^\"']+)[\"']", re.IGNORECASE)
# Group 1 is the (possibly empty) value of the real ``alt`` attribute; same
# lookbehind so that data-alt="..." is not mistaken for alt text.
ALT_REGEX = re.compile(r"(?<![\w:-])alt\s*=\s*[\"']([^\"']*)[\"']", re.IGNORECASE)


def extract_img_tags(xml_text: str) -> Tuple[list[str], list[str]]:
    """Collect the <img> tags found in *xml_text*.

    Returns:
        A pair ``(all_tags, tags_needing_alt)``. The second list keeps document
        order and contains every tag whose ``alt`` attribute is absent or holds
        only whitespace. Identical tags appear once per occurrence.
    """

    def _lacks_alt(tag: str) -> bool:
        # No alt attribute at all, or one whose value is blank.
        found = ALT_REGEX.search(tag)
        return found is None or not found.group(1).strip()

    all_tags = IMG_TAG_REGEX.findall(xml_text)
    return all_tags, [tag for tag in all_tags if _lacks_alt(tag)]


def get_image_bytes(url: str, auth: Tuple[str, str] | None = None) -> bytes | None:
    """Download *url* and return the raw response body.

    Args:
        url: Address of the image to fetch.
        auth: Optional ``(user, password)`` pair sent as HTTP Basic Auth.

    Returns:
        The response body, or ``None`` when anything goes wrong (bad URL,
        timeout after 20 seconds, non-2xx status, ...). Failures are reported
        through ``st.warning`` rather than raised, so one broken image does not
        abort the run.
    """
    try:
        credentials = HTTPBasicAuth(*auth) if auth else None
        reply = requests.get(url, timeout=20, auth=credentials)
        reply.raise_for_status()
        payload = reply.content
    except Exception as exc:  # noqa: BLE001
        st.warning(f"Failed to download image {url}: {exc}")
        return None
    return payload


# Fixed alt text used for the regulatory "black triangle" pictogram.
BLACK_TRIANGLE_PHRASE = "Triángulo negro debido a regulación de producto"
# File-name fragments that identify the pictogram. Matching is done on a
# normalised form (see _normalize_for_keyword_match), so spelling variants that
# differ only in case, accents, separators or percent-encoding need no entry.
BLACK_TRIANGLE_KEYWORDS = [
    "black-triangle",
    "black_triangle",
    "triangulo-negro",
    "triangulonegro",
    "triángulo-negro",
]


def _normalize_for_keyword_match(text: str) -> str:
    """Reduce *text* to a canonical form for keyword comparison.

    Percent-escapes are decoded, the text is lower-cased, accents are removed
    and hyphens, underscores and whitespace are dropped, so that
    ``Tri%C3%A1ngulo_Negro`` and ``triangulo-negro`` compare equal.
    """
    decoded = unquote(text).lower()
    # NFKD splits accented letters into base letter + combining mark; dropping
    # the marks leaves the unaccented letter.
    decomposed = unicodedata.normalize("NFKD", decoded)
    without_accents = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return re.sub(r"[-_\s]+", "", without_accents)


def looks_like_black_triangle(src: str) -> bool:
    """Return True when the image URL *src* names the black-triangle pictogram.

    The check is purely textual: *src* matches when its normalised form
    contains the normalised form of any entry in ``BLACK_TRIANGLE_KEYWORDS``.
    An empty *src* never matches.
    """
    if not src:
        return False
    haystack = _normalize_for_keyword_match(src)
    # Normalised per call so that edits to the keyword list are always honoured.
    needles = {_normalize_for_keyword_match(keyword) for keyword in BLACK_TRIANGLE_KEYWORDS}
    # Skip keywords that normalise to "" - they would match every URL.
    return any(needle in haystack for needle in needles if needle)


def _sniff_image_mime(image_bytes: bytes) -> str:
    """Guess an image MIME type from the leading signature bytes.

    Recognises PNG, JPEG, GIF and WebP. Anything else falls back to
    ``image/png``, the value that used to be sent unconditionally.
    """
    if image_bytes.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if image_bytes.startswith(b"\xff\xd8\xff"):
        return "image/jpeg"
    if image_bytes.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    if image_bytes[:4] == b"RIFF" and image_bytes[8:12] == b"WEBP":
        return "image/webp"
    return "image/png"


def generate_alt_text(
    image_bytes: bytes,
    context: str = "",
    force_black_triangle: bool = False,
    *,
    model: str = "gpt-4o-mini",
) -> Optional[str]:
    """Ask the OpenAI Responses API for a short Spanish alt text for an image.

    Args:
        image_bytes: Raw bytes of the image to describe.
        context: Article text near the image; interpolated into the prompt.
        force_black_triangle: When True, skip the API call entirely and return
            ``BLACK_TRIANGLE_PHRASE``.
        model: Name of the vision-capable model to query.

    Returns:
        The model's answer with surrounding quotes removed (it may be an empty
        string), or ``None`` when the payload is empty or the request fails.
        Problems are reported through Streamlit instead of being raised.
    """
    if force_black_triangle:
        return BLACK_TRIANGLE_PHRASE
    if not image_bytes:
        # Nothing to describe: avoid a request that can only fail.
        st.warning("Empty image payload – skipping OpenAI Vision call")
        return None

    b64_image = base64.b64encode(image_bytes).decode("utf-8")
    # Declare the real media type in the data URL instead of always claiming PNG.
    mime_type = _sniff_image_mime(image_bytes)
    prompt = (
        "Eres un experto en accesibilidad web. Genera un texto alternativo breve (máx 125 caracteres, en español) "
        "para describir la siguiente imagen y su función, teniendo en cuenta el contexto del artículo. "
        f"Contexto: {context}. "
        "Devuelve únicamente el texto alternativo, sin comillas. Si la imagen es un triángulo negro, ignorar el contexto y devolver únicamente la frase 'Triángulo negro debido a regulación de producto'. Hay logos que no se corresponden con el triangulo negro, aunque puedan contener triangulo negro, en este caso si puedes usar contexto y explicar de qué es el logo."
    )
    try:
        response = openai_client.responses.create(
            model=model,
            input=[
                {
                    "role": "user",
                    "content": [
                        {"type": "input_text", "text": prompt},
                        {"type": "input_image", "image_url": f"data:{mime_type};base64,{b64_image}"},
                    ],
                }
            ],
        )
        # ``or ""`` covers an attribute that exists but is None.
        text = (getattr(response, "output_text", "") or "").strip()
        # The prompt asks for no quotes, but strip them in case the model adds some.
        return text.strip('"').strip("'")
    except Exception as exc:  # noqa: BLE001
        st.error(f"OpenAI Vision error: {exc}")
        return None


# ---------------------------------------------------------------------------
# Streamlit UI
# ---------------------------------------------------------------------------
st.set_page_config(page_title="WP Image Alt-Text Filler", layout="wide")
st.title("🖼️ WordPress XML Alt-Text Filler")

uploaded_file = st.file_uploader("Upload your WordPress export (.xml)", type=["xml"])

# Upper bound on how many alt-less <img> tags a run handles; 0 disables the cap.
num_limit = st.number_input(
    label="Max images to process (0 = all)",
    min_value=0,
    value=50,
    step=1,
    help="If set to a positive number, only the first N missing `<img>` tags will be processed; set to 0 to process every tag detected.",
)


use_basic_auth = st.checkbox("Site is behind Basic Auth (staging)")

# Credentials are only looked up when the checkbox is ticked; the environment
# is tried first and the user is prompted for whatever is missing.
basic_user = None
basic_pwd = None
if use_basic_auth:
    basic_user = os.getenv("BASIC_AUTH_USER")
    basic_pwd = os.getenv("BASIC_AUTH_PWD")
    if not (basic_user and basic_pwd):
        st.info("Basic-auth credentials not found in .env – enter them below")
        basic_user = st.text_input("Username", value=basic_user or "")
        basic_pwd = st.text_input("Password", type="password", value=basic_pwd or "")

# The button stays disabled until a file has been uploaded.
process_btn = st.button("Start Processing", disabled=uploaded_file is None)

# Key under which the outcome of the last run is kept in st.session_state.
# Streamlit re-executes the script on every widget interaction (including a
# click on a download button), and on that rerun ``process_btn`` is False, so
# results rendered only inside the processing branch would disappear.
RESULTS_STATE_KEY = "alt_text_results"

if process_btn and uploaded_file is not None:
    # A fresh run supersedes whatever an earlier run left behind.
    st.session_state.pop(RESULTS_STATE_KEY, None)

    # getvalue() returns the whole upload regardless of the stream position,
    # so the result does not depend on earlier reads of the same object.
    xml_bytes: bytes = uploaded_file.getvalue()
    try:
        xml_text = xml_bytes.decode("utf-8")
    except UnicodeDecodeError:
        st.error("Could not decode XML as UTF-8. Please ensure the export is valid.")
        st.stop()

    st.success("XML loaded ✔️")

    _, img_tags_to_fix = extract_img_tags(xml_text)
    if num_limit > 0:
        img_tags_to_fix = img_tags_to_fix[: int(num_limit)]
    total = len(img_tags_to_fix)

    if total == 0:
        st.info("No <img> tags requiring alt text were found. 🎉")
        st.stop()

    st.write(f"Found {total} image(s) missing alt text.")
    progress_bar = st.progress(0)
    status_area = st.empty()

    # NOTE(review): these credentials are sent with every image request,
    # whatever host the URL points at. If exports can reference third-party
    # hosts, restrict them to the staging site's host.
    auth_tuple = (basic_user, basic_pwd) if use_basic_auth and basic_user and basic_pwd else None
    generated_cache: Dict[str, str] = {}  # src URL -> alt text already generated
    failed_urls = set()  # src URLs whose download failed; not retried in this run
    report_rows = []  # one row per tag that was actually rewritten
    updated_count = 0
    skipped_count = 0

    # An opening <p> or <p ...> tag, but not <pre>, <param> or the <pubDate>
    # element that every WXR item carries.
    PARAGRAPH_OPEN_REGEX = re.compile(r"<p(?=[\s>])", re.IGNORECASE)

    def extract_neighbor_paragraphs(full_text: str, tag_str: str) -> str:
        """Return the text of the <p> before and the <p> after *tag_str*.

        The search is confined to the <item> element enclosing the first
        occurrence of *tag_str*, so paragraphs from a neighbouring post are
        never used as context. When no item markers are found the whole
        document is searched. Returns "" if the tag or the paragraphs are absent.
        """
        idx = full_text.find(tag_str)
        if idx == -1:
            return ""
        tag_end = idx + len(tag_str)

        item_start = full_text.rfind("<item>", 0, idx)
        lower_bound = item_start if item_start != -1 else 0
        item_end = full_text.find("</item>", tag_end)
        upper_bound = item_end if item_end != -1 else len(full_text)

        # Previous paragraph: the last </p> before the tag and the nearest
        # opening <p> that precedes it.
        prev_text = ""
        prev_end = full_text.rfind("</p>", lower_bound, idx)
        if prev_end != -1:
            openings = list(PARAGRAPH_OPEN_REGEX.finditer(full_text, lower_bound, prev_end))
            if openings:
                prev_html = full_text[openings[-1].start():prev_end + 4]
                prev_text = BeautifulSoup(prev_html, "html.parser").get_text(" ", strip=True)

        # Next paragraph: the first opening <p> after the tag and its </p>.
        next_text = ""
        next_open = PARAGRAPH_OPEN_REGEX.search(full_text, tag_end, upper_bound)
        if next_open is not None:
            next_end = full_text.find("</p>", next_open.start(), upper_bound)
            if next_end != -1:
                next_html = full_text[next_open.start():next_end + 4]
                next_text = BeautifulSoup(next_html, "html.parser").get_text(" ", strip=True)

        return " ".join(part for part in (prev_text, next_text) if part)

    def resolve_alt_text(src_url: str, img_tag: str, full_text: str) -> Optional[str]:
        """Return the alt text for *src_url*, or None if the image is unavailable."""
        if looks_like_black_triangle(src_url):
            return BLACK_TRIANGLE_PHRASE
        if src_url in generated_cache:
            return generated_cache[src_url]
        if src_url in failed_urls:
            # Already failed once in this run; do not wait for another timeout.
            return None
        image_bytes = get_image_bytes(src_url, auth=auth_tuple)
        if image_bytes is None:
            failed_urls.add(src_url)
            return None
        context_text = extract_neighbor_paragraphs(full_text, img_tag)
        # "Imagen" is the generic fallback when the model gives no usable text.
        alt = generate_alt_text(image_bytes, context=context_text) or "Imagen"
        generated_cache[src_url] = alt
        return alt

    for idx, img_tag in enumerate(img_tags_to_fix, start=1):
        src_match = SRC_REGEX.search(img_tag)
        if not src_match:
            status_area.warning(f"Skipping tag with no src: {img_tag[:60]}…")
            skipped_count += 1
            progress_bar.progress(idx / total)
            continue

        # The value is taken from raw markup, so entities such as &amp; in a
        # query string must be decoded before the URL is requested.
        src_url = html.unescape(src_match.group(1)).strip()

        alt_text = resolve_alt_text(src_url, img_tag, xml_text)
        if alt_text is None:
            status_area.warning(f"Could not fetch image: {src_url}")
            skipped_count += 1
            progress_bar.progress(idx / total)
            continue

        # Build a new tag with BeautifulSoup for safe attribute handling
        # (the alt value is escaped on serialisation).
        soup = BeautifulSoup(img_tag, "html.parser")
        img = soup.find("img")
        if img is None:
            status_area.warning("Error parsing img tag – skipping")
            skipped_count += 1
            progress_bar.progress(idx / total)
            continue
        img["alt"] = alt_text
        new_tag_str = str(img)

        # Replace only the first remaining occurrence of this tag; identical
        # tags later in the list are handled by their own iteration.
        xml_text = xml_text.replace(img_tag, new_tag_str, 1)

        # Recorded only once the tag has really been rewritten.
        report_rows.append({"image_url": src_url, "alt_text": alt_text})
        updated_count += 1

        status_area.info(f"Processed {idx}/{total}")
        progress_bar.progress(idx / total)

    st.success("All done! 🎉")

    # Build the Excel report once, here, so later reruns only serve stored bytes.
    excel_bytes = None
    if report_rows:
        excel_buffer = io.BytesIO()
        with pd.ExcelWriter(excel_buffer, engine="xlsxwriter") as writer:
            pd.DataFrame(report_rows).to_excel(writer, index=False, sheet_name="Alt Text")
        excel_bytes = excel_buffer.getvalue()

    st.session_state[RESULTS_STATE_KEY] = {
        "source_name": uploaded_file.name,
        "xml_bytes": xml_text.encode("utf-8"),
        "report_rows": report_rows,
        "excel_bytes": excel_bytes,
        "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
        "total": total,
        "updated": updated_count,
        "skipped": skipped_count,
        "unique_fetched": len(generated_cache),
    }

# Rendered on every run while results for the currently uploaded file exist,
# so both downloads stay available after either button has been clicked.
results = st.session_state.get(RESULTS_STATE_KEY)
if (
    results is not None
    and uploaded_file is not None
    and results["source_name"] == uploaded_file.name
):
    # Summary
    st.subheader("Summary")
    st.write(f"Total images processed: {results['total']}")
    st.write(f"Images updated: {results['updated']}")
    st.write(f"Images skipped: {results['skipped']}")
    st.write(f"Unique images fetched: {results['unique_fetched']}")

    # Audit report
    if results["report_rows"]:
        df_report = pd.DataFrame(results["report_rows"])
        st.subheader("Audit Report (first 100 rows)")
        st.dataframe(df_report.head(100), use_container_width=True)
        st.download_button(
            label="Download audit report (Excel)",
            data=results["excel_bytes"],
            file_name=f"alt_text_audit_report_{results['timestamp']}.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )

    # Download updated XML
    st.download_button(
        label="Download updated XML",
        data=results["xml_bytes"],
        file_name="wordpress_with_alt.xml",
        mime="application/xml",
    )