"""pdf_text_replacer.py

A simple utility to scan a PDF file and replace **every** occurrence of a
search string with a replacement string, producing a new PDF.

Usage (command-line):
    python pdf_text_replacer.py input.pdf "SEARCH TEXT" "REPLACEMENT" [-o output.pdf] [--ignore-case]

Dependencies:
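    PyMuPDF (imported as `fitz`) – install with `pip install PyMuPDF`.
    pytesseract and Pillow – imported when the module loads; install with
    `pip install pytesseract Pillow`.
    pandas and the Tesseract OCR program – additionally needed for the OCR
    search that `--ignore-case` performs.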

Notes:
    • The implementation relies on PyMuPDF's redaction facility: each found
      text instance is covered with a white rectangle and an overlay text
      containing the replacement string.
    • Formatting (font size, colour, exact position) will not exactly match
      the original — the goal is functional text correction rather than
      pixel-perfect typography.
"""
from __future__ import annotations

import argparse
import os
import sys
from pathlib import Path

import fitz  # PyMuPDF
import pytesseract
from PIL import Image, UnidentifiedImageError
import io

# Disable Pillow's pixel-count limit (its decompression-bomb guard) so that
# large rendered PDF pages can be opened.
Image.MAX_IMAGE_PIXELS = None


def replace_text_in_pdf(
    input_pdf: str | os.PathLike,
    search_text: str,
    replacement_text: str,
    output_pdf: str | os.PathLike | None = None,
    case_sensitive: bool = True,
) -> str:
    """Replace *all* occurrences of *search_text* in *input_pdf*.

    Args:
        input_pdf: Path to the source PDF.
        search_text: Text to find.
        replacement_text: Text to insert instead.
        output_pdf: Destination path. Defaults to ``<input>_replaced.pdf``.
        case_sensitive: Whether the search should be case-sensitive.

    Returns:
        The path of the written PDF.
    """
    input_path = Path(input_pdf)
    if not input_path.exists():
        raise FileNotFoundError(f"Input PDF not found: {input_path}")

    if output_pdf is None:
        output_pdf = input_path.with_name(f"{input_path.stem}_replaced{input_path.suffix}")
    else:
        output_pdf = Path(output_pdf)

    # Open document *not* in incremental mode so we can save cleanly later.
    doc = fitz.open(str(input_path))

    for page in doc:
        if case_sensitive:
            text_instances = page.search_for(search_text)
        else:
            # OCR-based search for image-based PDFs.
            text_instances = []
            # Render page to an image
            pix = page.get_pixmap(dpi=200)
            img_data = pix.tobytes("png")
            img = Image.open(io.BytesIO(img_data))

            # Use pytesseract to get detailed OCR data
            ocr_data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DATAFRAME)
            ocr_data = ocr_data[ocr_data.conf > -1] # Filter out non-textual data

            # Reconstruct text and find matches
            words = list(ocr_data.text)
            search_words = search_text.lower().split()

            for i in range(len(words) - len(search_words) + 1):
                # Create a string from the sequence of words to compare
                word_sequence = [str(w).lower() for w in words[i:i+len(search_words)]]
                # Join the words to form a phrase, as pytesseract might split them
                phrase = " ".join(word_sequence)

                if search_text.lower() in phrase:
                    # Get bounding box for the first and last word of the match
                    first_word_info = ocr_data.iloc[i]
                    last_word_info = ocr_data.iloc[i + len(search_words) - 1]

                    x0 = first_word_info['left']
                    y0 = first_word_info['top']
                    x1 = last_word_info['left'] + last_word_info['width']
                    y1 = last_word_info['top'] + last_word_info['height']
                    
                    # Scale coordinates back to PDF space
                    rect = fitz.Rect(x0, y0, x1, y1) / pix.width * page.rect.width
                    text_instances.append(rect)

        if not text_instances:
            continue

        # Create a redaction annotation for each instance.
        for rect in text_instances:
            page.add_redact_annot(rect, text=replacement_text, fill=(1, 1, 1))

        # Apply all redactions in one go.
        page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)

    # Save to new file (overwrites if exists).
    doc.save(str(output_pdf), garbage=4, deflate=True)
    doc.close()

    return str(output_pdf)


def main(argv: list[str] | None = None) -> None:
    parser = argparse.ArgumentParser(description="Replace text in a PDF file.")
    parser.add_argument("pdf", help="Path to the PDF file")
    parser.add_argument("search", help="Text to search for")
    parser.add_argument("replace", help="Replacement text")
    parser.add_argument(
        "-o",
        "--output",
        help="Output PDF path (default: <input>_replaced.pdf)",
    )
    parser.add_argument(
        "--ignore-case",
        action="store_true",
        help="Perform a case-insensitive search",
    )

    args = parser.parse_args(argv)

    try:
        out_path = replace_text_in_pdf(
            args.pdf,
            args.search,
            args.replace,
            args.output,
            case_sensitive=not args.ignore_case,
        )
        print(f"✔ Replaced text written to: {out_path}")
    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()