import streamlit as st
import pandas as pd
from playwright.sync_api import sync_playwright, Page, Browser, Playwright
import time
import logging
import os
from PIL import Image, ImageChops
import img2pdf
import io
import re
from concurrent.futures import ProcessPoolExecutor, as_completed
import json
import tempfile
from pypdf import PdfReader, PdfWriter
try:
    from pypdf.annotations import AnnotationBuilder  # PyPDF ≥ 5.5
except ImportError:  # Older versions
    AnnotationBuilder = None
import copy
import math
import shutil
import atexit

# Setup logging
# All workers append to a single shared log file; INFO level captures the
# per-step progress messages emitted throughout this module.
logging.basicConfig(
    filename='screenshot_capture.log',
    filemode='a',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

# --- Global Constants ---
MAX_PDF_HEIGHT_PX = 14000  # Max height for a single PDF page segment in pixels
MAX_PDF_HEIGHT_PT = MAX_PDF_HEIGHT_PX * 72 / 96  # Convert to points (assuming 96 DPI)
MAX_CAPTURE_HEIGHT_SINGLE_SEGMENT = 13900 # Pages taller than this (in px) trigger minimal cleanup
DEFAULT_VIEWPORT_WIDTH = 1920 # Default viewport width in pixels for PDF capture
DEFAULT_FALLBACK_PAGE_HEIGHT_PX = 1080 # Fallback page height if everything else fails
# --- End Global Constants ---

def cleanup_temp_storage_file():
    """Clean up the temporary storage state file if it exists.

    Intended to be registered with ``atexit``. Reads the file path from
    Streamlit session state, removes the file, and clears the session key so
    repeated invocations are no-ops. All outcomes are logged; nothing is
    raised to the caller.
    """
    file_path = st.session_state.get('temp_storage_file_to_delete')
    if file_path and os.path.exists(file_path):
        try:
            os.remove(file_path)
            # Log tag corrected from the previous misspelling "ATEIXT".
            logging.info(f"ATEXIT: Removed temporary storage state file: {file_path}")
            del st.session_state.temp_storage_file_to_delete # Prevent re-attempts
        except OSError as e:
            logging.error(f"ATEXIT: Error removing temporary storage state file {file_path}: {e}")
    elif file_path:
        logging.info(f"ATEXIT: Temporary storage state file {file_path} not found, skipping removal.")

def initialize_playwright():
    """Start Playwright and launch a Chromium instance.

    Returns
    -------
    tuple
        ``(playwright, browser)`` — the started Playwright driver and the
        launched Chromium browser, both to be closed by the caller.
    """
    pw_driver = sync_playwright().start()
    # headless=False keeps the browser window visible for debugging;
    # switch to headless=True for production runs.
    chromium_browser = pw_driver.chromium.launch(headless=False)
    logging.info("Initialized Playwright and launched browser")
    return pw_driver, chromium_browser

def login_to_portal(username: str, password: str, login_url: str) -> str | None:
    """Login to the portal using Playwright, save storage state to a temp file, and return the file path.

    Parameters
    ----------
    username : str
        Portal account username (filled twice: on the initial form and again
        on the retry field the portal sometimes shows on the password step).
    password : str
        Portal account password.
    login_url : str
        URL of the portal login page.

    Returns
    -------
    str | None
        Path to a temporary JSON file holding the Playwright storage state
        (cookies/localStorage) on success, or None on any failure.
    """
    # Pre-declare all Playwright handles so the finally block can close
    # whichever of them were actually created before an error occurred.
    playwright = None
    browser = None
    context = None
    page = None
    try:
        st.info("Attempting login and saving session state...")
        playwright = sync_playwright().start()
        # Use headless=True for the initial login for efficiency, unless debugging login itself
        browser = playwright.chromium.launch(headless=True) 
        # Desktop-like context with a real browser user agent, to avoid being
        # served a mobile layout or a bot challenge.
        context = browser.new_context(
            viewport={"width": 1920, "height": 1080},
            device_scale_factor=1,
            is_mobile=False,
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        page = context.new_page()
        
        st.info("Navigating to login page...")
        page.goto(login_url)
        logging.info("Navigated to login page")
        time.sleep(1)
        # Dismiss the OneTrust cookie banner if it appears; failure here is
        # non-fatal because the banner may simply not be shown.
        try:
            accept_button = page.locator("#onetrust-accept-btn-handler")
            if accept_button.is_visible(timeout=3000):
                accept_button.click()
                logging.info("Accepted cookies during login attempt")
                time.sleep(1)
        except Exception as e:
            logging.info(f"Cookie dialog not found or error during login: {e}")

        st.info("Entering username...")
        page.locator("#capture_signInFull_username").fill(username)
        logging.info("Entered username")
        
        st.info("Proceeding to password step...")
        page.locator("#buttonNext_signInFull").click()
        logging.info("Clicked 'Siguiente' button")
        
        st.info("Entering credentials...")
        # The portal sometimes re-prompts for the username on the password
        # step; wait for that field and re-fill it when visible.
        username_field_retry = page.locator("#capture_signInFull_signInUsername")
        username_field_retry.wait_for(state='visible', timeout=10000)
        if username_field_retry.is_visible():
            username_field_retry.fill(username)
            logging.info("Re-entered username")
        
        page.locator("#capture_signInFull_currentPassword").fill(password)
        logging.info("Entered password")
        
        st.info("Submitting login...")
        page.get_by_role("button", name="Acceda").click()
        logging.info("Clicked 'Acceda' button")

        st.info("Waiting for login confirmation...")
        # Login success is detected by the URL no longer containing "login".
        page.wait_for_url(lambda url: "login" not in url, timeout=25000)
        logging.info(f"Login successful, redirected to: {page.url}")
        st.success("Login successful! Session state captured.")
        
        # Save storage state to a temporary file
        storage_state = context.storage_state()
        
        # Create a temporary file. delete=False is deliberate: the file must
        # outlive this function so worker processes can reuse the session;
        # it is removed later by the atexit cleanup handler.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
            json.dump(storage_state, temp_file)
            storage_state_path = temp_file.name
            logging.info(f"Saved storage state to temporary file: {storage_state_path}")
            st.session_state.temp_storage_file_to_delete = storage_state_path # Store for atexit cleanup
        
        return storage_state_path # Return the path

    except Exception as e:
        logging.error(f"An error occurred during login: {e}", exc_info=True)
        st.error(f"Login failed: {e}")
        return None
    finally:
        # Cleanup resources used for login
        if page: page.close()
        if context: context.close()
        if browser: browser.close()
        if playwright: playwright.stop()

def read_excel(file) -> pd.DataFrame:
    """Read all data from an Excel file, using the first row as the header.

    On any failure the error is logged and surfaced to the UI, and an empty
    DataFrame is returned so callers can detect the problem with a simple
    emptiness check.
    """
    try:
        # header=0: treat the first spreadsheet row as column names.
        frame = pd.read_excel(file, header=0)
    except Exception as e:
        logging.error(f"Error reading Excel file: {e}")
        st.error(f"Error reading Excel file: {e}")
        return pd.DataFrame() # Return empty DataFrame on error to be checked by caller
    logging.info(f"Read Excel file. Shape: {frame.shape}. Columns: {frame.columns.tolist()}")
    return frame

def generate_custom_filename(type_of_material: str, name: str, codex: str) -> str:
    """Generate a custom PDF filename from Excel data.
       Example: infografia_Flow3Ciclo3Tomás_ES-DSM-00864.pdf"""
    # Clean type_of_material: remove trailing '_'
    cleaned_type_of_material = str(type_of_material).rstrip('_')

    # Clean name: keep only letters and digits. [\W_] is Unicode-aware, so
    # accented characters (e.g. "Tomás") survive — matching the docstring
    # example — while spaces, punctuation and underscores are stripped.
    # (The previous ASCII-only pattern [^a-zA-Z0-9] dropped accented letters.)
    cleaned_name = re.sub(r'[\W_]+', '', str(name))

    # Codex is used as is, ensure it's a string
    cleaned_codex = str(codex)

    # Concatenate parts, filtering out any empty parts that might result from cleaning or empty inputs
    filename_parts = [part for part in [cleaned_type_of_material, cleaned_name, cleaned_codex] if part]

    if not filename_parts: # If all parts are empty
        base_name = "default_filename"
        logging.warning("All parts for filename were empty or cleaned to empty. Using 'default_filename.pdf'.")
    else:
        base_name = "_".join(filename_parts)

    # Ensure the final base_name is not empty if parts joined to nothing (e.g. just underscores)
    if not base_name.strip('_'):
        base_name = "fallback_filename"
        logging.warning("Filename parts resulted in an empty or underscore-only name. Using 'fallback_filename.pdf'.")

    return f"{base_name}.pdf"

# --- Start: Helper Functions for Screenshot-to-PDF Conversion ---

def trim_image_whitespace(image_path: str, bg_color=(255, 255, 255)):
    """
    Trim whitespace from the bottom and sides of an image.

    Parameters
    ----------
    image_path : str
        Path to the source image file.
    bg_color : tuple
        RGB color considered to be background/whitespace.

    Returns
    -------
    Tuple[str, int, int]
        (path_to_trimmed_image, left_offset_px, top_offset_px)
        If no trimming was required or an error occurred, the original image
        path is returned with offsets (0, 0) so that callers can still work
        with consistent return values.
    """
    try:
        img = Image.open(image_path).convert("RGB")
        # Diff against a solid background; getbbox() then yields the smallest
        # rectangle containing every non-background pixel.
        bg = Image.new("RGB", img.size, bg_color)
        diff = ImageChops.difference(img, bg)
        bbox = diff.getbbox()
        if bbox:
            left, upper, right, lower = bbox
            trimmed_img = img.crop(bbox)
            # Build the output path via splitext. The previous
            # image_path.replace(".png", "_trimmed.png") silently returned the
            # unchanged path for non-.png inputs, causing the original file to
            # be overwritten; splitext handles any extension safely.
            root, _ext = os.path.splitext(image_path)
            trimmed_path = f"{root}_trimmed.png"
            trimmed_img.save(trimmed_path, "PNG")
            logging.info(f"Saved trimmed image to {trimmed_path}")
            return trimmed_path, left, upper
        else:
            logging.warning(f"Could not find content box in {image_path}. Returning original path without trimming.")
            return image_path, 0, 0
    except Exception as e:
        logging.error(f"Error trimming image {image_path}: {e}", exc_info=True)
        # Fallback to original image with zero offsets so pipeline can proceed
        return image_path, 0, 0

def convert_image_to_pdf(image_path: str, pdf_path: str):
    """
    Convert a single image file into a one-page PDF at ``pdf_path``.

    Raises on failure after logging, so callers can abort the pipeline.
    """
    try:
        pdf_bytes = img2pdf.convert(image_path)
        with open(pdf_path, "wb") as out_file:
            out_file.write(pdf_bytes)
        logging.info(f"Successfully converted {image_path} to {pdf_path}")
    except Exception as e:
        logging.error(f"Error converting image {image_path} to PDF: {e}", exc_info=True)
        raise

# --- End: Helper Functions ---

def remove_trailing_blank_page(pdf_path: str):
    """If the last page of the PDF has no text or XObjects, remove it."""
    try:
        reader = PdfReader(pdf_path)
        page_count = len(reader.pages)
        if page_count <= 1:
            return  # nothing to trim
        final_page = reader.pages[-1]
        # Extraction can fail on malformed pages; treat that as "no text".
        try:
            extracted = final_page.extract_text()
            has_text = bool(extracted and extracted.strip())
        except Exception:
            has_text = False
        has_xobjects = "/XObject" in final_page.get("/Resources", {})
        if has_text or has_xobjects:
            return  # last page carries content — keep it
        # Rewrite the PDF without its final (blank) page.
        writer = PdfWriter()
        for idx in range(page_count - 1):
            writer.add_page(reader.pages[idx])
        with open(pdf_path, "wb") as f_out:
            writer.write(f_out)
        logging.info(f"Removed trailing blank page from {pdf_path}")
    except Exception as e:
        logging.warning(f"Could not post-process PDF {pdf_path} to remove blank page: {e}")

def capture_and_save_pdf(page: Page, url: str, output_dir: str, filename: str):
    """Capture full page as PDF, segmented into max 14000px high sections using pypdf."""
    try:
        # --- Start: Capture Browser Console Logs ---
        def log_console_message(msg):
            # Log messages, filtering out any potential noise if necessary
            logging.info(f"[Browser Console] {msg.type.upper()}: {msg.text}")
        
        page.on("console", log_console_message)
        # --- End: Capture Browser Console Logs ---

        page.goto(url, wait_until='load', timeout=60000)
        logging.info(f"Navigated to {url}. [Process/Worker]")

        # Set a very large viewport height to encourage full layout rendering
        large_viewport_height = 30000
        page.set_viewport_size({"width": DEFAULT_VIEWPORT_WIDTH, "height": large_viewport_height})
        logging.info(f"Set viewport to {DEFAULT_VIEWPORT_WIDTH}x{large_viewport_height} for {url}. [Process/Worker]")

        # --- Start: Hide Cookie Consent Icon (e.g., OneTrust) ---
        try:
            logging.info(f"Attempting to hide cookie consent icon for {url}. [Process/Worker]")
            # Common selectors for OneTrust. Adjust if necessary.
            selectors_to_hide = [
                '#onetrust-consent-sdk',
                'div[id^="onetrust-banner-sdk"]',
                '.onetrust-pc-dark-filter',
                '#ot-sdk-btn-floating'
            ]
            for selector in selectors_to_hide:
                page.evaluate(f"(selector) => {{ const el = document.querySelector(selector); if (el) el.style.display = 'none'; }}", selector)
            logging.info(f"Executed JS to hide cookie consent icon for {url}. [Process/Worker]")
        except Exception as e_hide_cookie:
            logging.warning(f"Could not hide cookie consent icon for {url}: {e_hide_cookie}. Interactions might be affected. [Process/Worker]")
        # --- End: Hide Cookie Consent Icon ---

        actual_page_height = None # Initialize to None
        # --- Start: Click Expandable Sections & Scroll ---
        page.wait_for_timeout(3000) # Increased delay for page to settle before interacting
        # NOTE: The expansion and scrolling logic is kept here for the initial load (tab1), but will be replicated inside the tab loop for consistency.

        try:
            # --- Expand any 'Leer más' expandable content ---
            expand_buttons = page.locator('text=/Leer más/i')
            count = expand_buttons.count()
            if count > 0:
                logging.info(f"Found {count} 'Leer más' elements for {url}. Clicking them. [Process/Worker]")
                for i in range(count):
                    try:
                        button = expand_buttons.nth(i)
                        if button.is_visible():
                            button.click(timeout=5000)  # Increased timeout
                            page.wait_for_timeout(1000)  # Delay after click to allow DOM update
                    except Exception as click_err:
                        logging.warning(f"Could not click 'Leer más' element {i} on {url}: {click_err} [Process/Worker]")

            # --- Robustly expand accordion elements (may load lazily) ---
            try:
                # Wait briefly for any accordion icons to appear
                try:
                    page.wait_for_selector('span.mhh-mcn-v1-accordion-molecule-header__icon-dropdown', timeout=5000)
                except Exception:
                    pass  # Not critical if none appear immediately

                max_rounds = 4  # Repeat to catch accordions that load after earlier ones expand
                for round_idx in range(max_rounds):
                    closed_selector = 'button:has(span.mhh-mcn-v1-accordion-molecule-header__icon-dropdown)[aria-expanded="false"]'
                    closed_buttons = page.locator(closed_selector)
                    closed_count = closed_buttons.count()
                    if closed_count == 0:
                        break  # All accordions are open

                    logging.info(
                        f"Accordion expansion round {round_idx + 1}: "
                        f"{closed_count} closed accordions detected for {url}. [Process/Worker]"
                    )

                    for btn_idx in range(closed_count):
                        try:
                            btn = closed_buttons.nth(btn_idx)
                            if btn.is_visible():
                                btn.click(timeout=5000)
                                page.wait_for_timeout(600)  # Allow DOM to update
                        except Exception as acc_err:
                            logging.warning(
                                f"Could not click accordion element {btn_idx} on {url}: {acc_err} [Process/Worker]"
                            )

                    # Short pause before re-evaluating, to allow lazily loaded accordions to render
                    page.wait_for_timeout(800)

                # Final log of any remaining closed accordions
                remaining_closed = page.locator(
                    'button:has(span.mhh-mcn-v1-accordion-molecule-header__icon-dropdown)[aria-expanded="false"]'
                ).count()
                logging.info(
                    f"Accordion expansion complete for {url}. Remaining closed accordions: {remaining_closed} [Process/Worker]"
                )
            except Exception as accordion_global_err:
                logging.error(
                    f"Error during accordion expansion on {url}: {accordion_global_err} [Process/Worker]"
                )
            
            logging.info(f"Scrolling page to trigger lazy loading for {url}... [Process/Worker]")
            total_page_height_js = """
            () => {
                return document.documentElement.scrollHeight;
            }
            """
            actual_page_height = page.evaluate(total_page_height_js)
            viewport_height = page.viewport_size['height'] if page.viewport_size else 768
            current_scroll = 0
            scroll_increment = viewport_height * 0.8
            max_scroll_iterations = 50
            iterations = 0
            while current_scroll < actual_page_height and iterations < max_scroll_iterations:
                page.mouse.wheel(0, scroll_increment)
                page.wait_for_timeout(500)
                current_scroll += scroll_increment
                new_total_page_height = page.evaluate(total_page_height_js)
                if new_total_page_height > actual_page_height:
                    actual_page_height = new_total_page_height
                elif page.evaluate("window.scrollY + window.innerHeight >= document.body.scrollHeight - 10"):
                    logging.info(f"Attempting to generate PDF with effective height: {actual_page_height}px for {url} [Process/Worker]")
                    break
                iterations += 1
            if iterations >= max_scroll_iterations:
                logging.warning(f"Max scroll iterations reached for {url}. Final height: {actual_page_height} [Process/Worker]")
            logging.info(f"Finished scrolling for {url}. Final evaluated height: {actual_page_height}px [Process/Worker]")

        except Exception as scroll_err:
            logging.error(f"Error during clicking/scrolling for {url}: {scroll_err} [Process/Worker]")
        # --- End: Click Expandable Sections & Scroll ---

        if actual_page_height is None:
            logging.error(f"Could not determine page height for {url}. Aborting PDF capture.")
            return None

        initial_page_height = actual_page_height # Store height before any potential changes
        logging.info(f"Initial page height for {url}: {initial_page_height}px [Process/Worker]")

        # --- All CSS and JS injection has been removed to simplify rendering. ---
        # We will rely on the browser's native print rendering.

        # Determine if the page is 'long' for logging/future conditional logic.
        # This currently has no effect on rendering as CSS/JS injection is disabled.
        is_long_page = initial_page_height > MAX_CAPTURE_HEIGHT_SINGLE_SEGMENT
        logging.info(f"Page classification: {'Long' if is_long_page else 'Short'}. Note: This currently does not alter rendering. [Process/Worker]")

        # Print media emulation has been removed. initial_page_height is already set.
        # final_page_height will be determined after settling actions.

        # --- Start: Multi-tab Component Handling ---
        # Some pages include a multi-tab component (e.g. efficacy/safety tabs). By default only the first tab
        # would be captured. The following block detects such components and iterates through every tab so that
        # each one is captured in a separate PDF (filename appended with _tabN).
        try:
            tab_items_locator = page.locator("ul.mhh-mcn-v1-tabs-panel-nav__list li")
            tab_count = tab_items_locator.count()
        except Exception:
            tab_items_locator = None
            tab_count = 0

        if tab_count and tab_count > 1:
            logging.info(f"Detected multi-tab component with {tab_count} tabs for {url}. Capturing each tab separately. [Process/Worker]")
            base_name, ext = os.path.splitext(filename)
            captured_paths = []

            for i in range(tab_count):
                try:
                    # Click the tab (even tab 1 to ensure active state)
                    tab_el = tab_items_locator.nth(i)
                    try:
                        tab_el.scroll_into_view_if_needed(timeout=3000)
                    except Exception:
                        pass  # not critical
                    tab_click_target = tab_el.locator('a, button') if tab_el.locator('a, button').count() > 0 else tab_el
                    tab_click_target.click(timeout=5000)
                    page.wait_for_timeout(2000)  # Increased wait after tab click to allow content to load fully

                    # --- Replicate expansion and scrolling logic for each tab ---
                    try:
                        # --- Expand any 'Leer más' expandable content in this tab ---
                        expand_buttons = page.locator('text=/Leer más/i')
                        count = expand_buttons.count()
                        if count > 0:
                            logging.info(f"Found {count} 'Leer más' elements in tab {i+1} for {url}. Clicking them. [Process/Worker]")
                            for j in range(count):
                                try:
                                    button = expand_buttons.nth(j)
                                    if button.is_visible():
                                        button.click(timeout=5000)
                                        page.wait_for_timeout(1000)
                                except Exception as click_err:
                                    logging.warning(f"Could not click 'Leer más' element {j} in tab {i+1} on {url}: {click_err} [Process/Worker]")

                        # --- Expand accordion elements in this tab ---
                        try:
                            page.wait_for_selector('span.mhh-mcn-v1-accordion-molecule-header__icon-dropdown', timeout=5000)
                        except Exception:
                            pass

                        max_rounds = 4
                        for round_idx in range(max_rounds):
                            closed_selector = 'button:has(span.mhh-mcn-v1-accordion-molecule-header__icon-dropdown)[aria-expanded="false"]'
                            closed_buttons = page.locator(closed_selector)
                            closed_count = closed_buttons.count()
                            if closed_count == 0:
                                break

                            logging.info(
                                f"Accordion expansion round {round_idx + 1} in tab {i+1}: "
                                f"{closed_count} closed accordions detected for {url}. [Process/Worker]"
                            )

                            for btn_idx in range(closed_count):
                                try:
                                    btn = closed_buttons.nth(btn_idx)
                                    if btn.is_visible():
                                        btn.click(timeout=5000)
                                        page.wait_for_timeout(600)
                                except Exception as acc_err:
                                    logging.warning(
                                        f"Could not click accordion {btn_idx} in tab {i+1} on {url}: {acc_err} [Process/Worker]"
                                    )

                            page.wait_for_timeout(800)

                        remaining_closed = page.locator(
                            'button:has(span.mhh-mcn-v1-accordion-molecule-header__icon-dropdown)[aria-expanded="false"]'
                        ).count()
                        logging.info(
                            f"Accordion expansion complete in tab {i+1} for {url}. Remaining closed: {remaining_closed} [Process/Worker]"
                        )
                    except Exception as accordion_tab_err:
                        logging.error(
                            f"Error during accordion expansion in tab {i+1} on {url}: {accordion_tab_err} [Process/Worker]"
                        )

                    # --- Scroll to trigger lazy loading in this tab ---
                    try:
                        logging.info(f"Scrolling tab {i+1} to trigger lazy loading for {url}... [Process/Worker]")
                        actual_page_height = page.evaluate(total_page_height_js)
                        current_scroll = 0
                        scroll_increment = viewport_height * 0.8
                        iterations = 0
                        while current_scroll < actual_page_height and iterations < max_scroll_iterations:
                            page.mouse.wheel(0, scroll_increment)
                            page.wait_for_timeout(800)  # Increased wait per scroll for big images to load
                            current_scroll += scroll_increment
                            new_total_page_height = page.evaluate(total_page_height_js)
                            if new_total_page_height > actual_page_height:
                                actual_page_height = new_total_page_height
                            iterations += 1
                        logging.info(f"Finished scrolling tab {i+1} for {url}. Final height: {actual_page_height}px [Process/Worker]")
                    except Exception as scroll_tab_err:
                        logging.error(f"Error during scrolling in tab {i+1} for {url}: {scroll_tab_err} [Process/Worker]")

                    # --- Settling actions for this tab ---
                    page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                    page.wait_for_timeout(2000)
                    try:
                        page.wait_for_load_state('networkidle', timeout=15000)  # Increased timeout for network idle
                        logging.info(f"Network idle after settling in tab {i+1} for {url}. [Process/Worker]")
                    except Exception as e_network_idle:
                        logging.warning(f"Timeout waiting for network idle in tab {i+1} for {url}: {e_network_idle}. Proceeding. [Process/Worker]")

                    # Re-evaluate final height
                    final_page_height = page.evaluate(total_page_height_js)
                    logging.info(f"Final height after settling in tab {i+1}: {final_page_height}px for {url}. [Process/Worker]")

                    page.wait_for_timeout(5000)  # Final wait for images to render

                    # --- Capture link info BEFORE screenshot for annotation purposes ---
                    link_infos = []
                    try:
                        possible_selectors = ["a:has-text('Ficha Técnica')", "a:has-text('Ficha Tecnica')"]
                        for sel in possible_selectors:
                            elements = page.locator(sel)
                            cnt = elements.count()
                            for lidx in range(cnt):
                                el = elements.nth(lidx)
                                try:
                                    el.scroll_into_view_if_needed(timeout=3000)
                                except Exception:
                                    pass
                                bbox_abs = el.evaluate("el => { const r = el.getBoundingClientRect(); return { x: r.left + window.scrollX, y: r.top + window.scrollY, width: r.width, height: r.height }; }")
                                href = el.get_attribute('href')
                                if bbox_abs and href:
                                    link_infos.append({"bbox": bbox_abs, "href": href})
                        # CIMA links
                        try:
                            elements = page.locator("a[href*='cima.aemps.es']")
                            cnt_href = elements.count()
                            for hidx in range(cnt_href):
                                el = elements.nth(hidx)
                                try:
                                    el.scroll_into_view_if_needed(timeout=3000)
                                except Exception:
                                    pass
                                bbox_abs = el.evaluate("el => { const r = el.getBoundingClientRect(); return { x: r.left + window.scrollX, y: r.top + window.scrollY, width: r.width, height: r.height }; }")
                                href = el.get_attribute('href')
                                if bbox_abs and href:
                                    link_infos.append({"bbox": bbox_abs, "href": href})
                        except Exception as href_err:
                            logging.warning(f"Could not collect CIMA href link info for {url} (tab {i+1}): {href_err} [Process/Worker]")
                    except Exception as link_err:
                        logging.warning(f"Could not collect 'Ficha Técnica' link info for {url} (tab {i+1}): {link_err} [Process/Worker]")

                    variant_filename = f"{base_name}_tab{i+1}{ext}"
                    raw_screenshot_path = os.path.join(output_dir, f"RAW_SCREENSHOT_{variant_filename.replace('.pdf', '.png')}")
                    logging.info(f"Capturing screenshot for tab {i+1} – saving to {raw_screenshot_path}. [Process/Worker]")
                    page.screenshot(path=raw_screenshot_path, full_page=True)

                    # Trim and convert to PDF
                    trimmed_screenshot_path, offset_left, offset_top = trim_image_whitespace(raw_screenshot_path)
                    final_pdf_path = os.path.join(output_dir, variant_filename)
                    convert_image_to_pdf(trimmed_screenshot_path, final_pdf_path)

                    # --- Add annotations if link info present ---
                    try:
                        if link_infos:
                            img = Image.open(trimmed_screenshot_path)
                            img_w, img_h = img.size
                            reader = PdfReader(final_pdf_path)
                            writer = PdfWriter()
                            for p in reader.pages:
                                writer.add_page(p)
                            pdf_page = writer.pages[0]
                            pdf_w = float(pdf_page.mediabox.width)
                            pdf_h = float(pdf_page.mediabox.height)
                            rx = pdf_w / img_w
                            ry = pdf_h / img_h
                            for info in link_infos:
                                bbox = info['bbox']
                                href = info['href']
                                adj_x = (bbox['x'] - offset_left) * rx
                                adj_y_top = (bbox['y'] - offset_top) * ry
                                adj_w = bbox['width'] * rx
                                adj_h = bbox['height'] * ry
                                x_ll = adj_x
                                x_ur = adj_x + adj_w
                                y_ur = pdf_h - adj_y_top
                                y_ll = y_ur - adj_h
                                # ensure minimum clickable area 10x10pt
                                min_side = 10
                                if (x_ur - x_ll) < min_side:
                                    expand = (min_side - (x_ur - x_ll)) / 2
                                    x_ll -= expand
                                    x_ur += expand
                                if (y_ur - y_ll) < min_side:
                                    expand = (min_side - (y_ur - y_ll)) / 2
                                    y_ll -= expand
                                    y_ur += expand
                                x_ll = max(0, x_ll); y_ll = max(0, y_ll)
                                x_ur = min(pdf_w, x_ur); y_ur = min(pdf_h, y_ur)
                                rect = [int(x_ll), int(y_ll), int(x_ur), int(y_ur)]
                                try:
                                    if hasattr(writer, 'add_uri'):
                                        try:
                                            writer.add_uri(0, href, rect, border=[0,0,0])
                                        except TypeError:
                                            writer.add_uri(0, href, rect)
                                    else:
                                        if AnnotationBuilder:
                                            annot = AnnotationBuilder.uri(rect, href, border_width=0)
                                            writer.add_annotation(0, annot)
                                except Exception as uri_err:
                                    logging.warning(f"Failed to add URI annotation for {href} on {url} (tab {i+1}): {uri_err} [Process/Worker]")
                            with open(final_pdf_path, 'wb') as _out:
                                writer.write(_out)
                    except Exception as anno_err:
                        logging.warning(f"Failed to post-process PDF annotations for {url} (tab {i+1}): {anno_err} [Process/Worker]")

                    captured_paths.append(final_pdf_path)

                    # Clean temp images
                    try:
                        os.remove(raw_screenshot_path)
                        if os.path.exists(trimmed_screenshot_path) and trimmed_screenshot_path != raw_screenshot_path:
                            os.remove(trimmed_screenshot_path)
                    except Exception:
                        pass

                    logging.info(f"Successfully captured tab {i+1} for {url} as {variant_filename}. [Process/Worker]")
                except Exception as tab_err:
                    logging.warning(f"Failed to capture tab {i+1} for {url}: {tab_err} [Process/Worker]")

            # If at least one tab PDF was created, return the first one's path so upstream logic remains unchanged.
            if captured_paths:
                return captured_paths[0]
            else:
                return None
        # --- End: Multi-tab Component Handling ---
        
        # --- Start: PDF Generation (Single Full-Page) ---

        # Settling actions for non-tab case
        logging.info(f"Performing final scroll and delays before final height measurement for {url}. [Process/Worker]")
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        page.wait_for_timeout(2000) # Wait 2 seconds for explicit rendering time

        try:
            logging.info(f"Waiting for network idle for {url}. [Process/Worker]")
            page.wait_for_load_state('networkidle', timeout=10000) # Wait for network activity to cease
            logging.info(f"Network is idle for {url}. [Process/Worker]")
        except Exception as e_network_idle:
            logging.warning(f"Timeout or error waiting for network idle for {url}: {e_network_idle}. Proceeding with PDF capture. [Process/Worker]")

        # Re-evaluate page height AFTER all settling actions
        final_page_height = page.evaluate(total_page_height_js)
        logging.info(f"Final page height for {url} after all settling actions: {final_page_height}px [Process/Worker]")

        # Log if page height changed significantly from initial measurement
        if abs(final_page_height - initial_page_height) > 50:
            logging.info(f"Page height changed for {url} after settling actions: from {initial_page_height}px to {final_page_height}px. [Process/Worker]")

        
        if final_page_height <= 100: # Check for suspiciously small height
            logging.warning(f"Final page height for {url} is suspiciously small ({final_page_height}px). Using initial height as fallback if larger. [Process/Worker]")
            if initial_page_height > final_page_height: # Only use initial_page_height if it's actually larger
                 final_page_height = initial_page_height
                 logging.info(f"Reverted to initial height {initial_page_height}px due to small final height for {url}. [Process/Worker]")

        # Ensure there's a valid height to use for PDF generation.
        if final_page_height <= 0:
            logging.error(f"Final page height for {url} is zero or negative. Using initial height: {initial_page_height}px. [Process/Worker]")
            final_page_height = initial_page_height # Fallback to initial height
            if final_page_height <= 0: # Still zero, fallback to a default
                final_page_height = DEFAULT_FALLBACK_PAGE_HEIGHT_PX
                logging.error(f"Initial height also invalid. Using default fallback height: {DEFAULT_FALLBACK_PAGE_HEIGHT_PX}px for {url}. [Process/Worker]")

        output_filename = os.path.join(output_dir, filename)
        try:
            # --- Start: Screenshot-to-PDF Generation ---
            # 1. Take the full-page screenshot
            logging.info(f"Waiting 5 seconds for final image rendering before screenshot for {url}. [Process/Worker]")
            page.wait_for_timeout(5000)  # Added delay for images to load

            # --- Capture 'Ficha Técnica' / 'Ficha Tecnica' link info before screenshot ---
            link_infos = []
            try:
                possible_selectors = ["a:has-text('Ficha Técnica')", "a:has-text('Ficha Tecnica')"]
                for sel in possible_selectors:
                    elements = page.locator(sel)
                    count_links = elements.count()
                    for idx in range(count_links):
                        el = elements.nth(idx)
                        try:
                            el.scroll_into_view_if_needed(timeout=3000)
                        except Exception:
                            pass
                        # Evaluate bounding box in **document** coordinates so later mapping is correct even if page is scrolled
                        bbox_abs = el.evaluate("el => { const r = el.getBoundingClientRect(); return { x: r.left + window.scrollX, y: r.top + window.scrollY, width: r.width, height: r.height }; }")
                        href = el.get_attribute('href')
                        if bbox_abs and href:
                            link_infos.append({"bbox": bbox_abs, "href": href})

                # --- Also capture dose-specific links that point to the CIMA parent site ---
                try:
                    elements = page.locator("a[href*='cima.aemps.es']")
                    count_links_href = elements.count()
                    for idx in range(count_links_href):
                        el = elements.nth(idx)
                        try:
                            el.scroll_into_view_if_needed(timeout=3000)
                        except Exception:
                            pass
                        bbox_abs = el.evaluate("el => { const r = el.getBoundingClientRect(); return { x: r.left + window.scrollX, y: r.top + window.scrollY, width: r.width, height: r.height };")
                        href = el.get_attribute('href')
                        if bbox_abs and href:
                            link_infos.append({"bbox": bbox_abs, "href": href})
                except Exception as href_err:
                    logging.warning(f"Could not collect CIMA href link info for {url}: {href_err} [Process/Worker]")
            except Exception as link_err:
                logging.warning(f"Could not collect 'Ficha Técnica' link info for {url}: {link_err} [Process/Worker]")

            raw_screenshot_path = os.path.join(output_dir, f"RAW_SCREENSHOT_{filename.replace('.pdf', '.png')}")
            logging.info(f"Taking full-page screenshot for {url} and saving to {raw_screenshot_path}. [Process/Worker]")
            page.screenshot(path=raw_screenshot_path, full_page=True)

            # 2. Trim the whitespace from the screenshot
            logging.info(f"Trimming whitespace from {raw_screenshot_path}. [Process/Worker]")
            trimmed_screenshot_path, offset_left, offset_top = trim_image_whitespace(raw_screenshot_path)
            if not trimmed_screenshot_path:
                raise Exception("Failed to trim screenshot whitespace.")

            # 3. Convert the trimmed screenshot to PDF
            logging.info(f"Converting trimmed screenshot {trimmed_screenshot_path} to PDF at {output_filename}. [Process/Worker]")
            convert_image_to_pdf(trimmed_screenshot_path, output_filename)

            # --- Add clickable link annotation to PDF if link info available ---
            try:
                if link_infos:
                    img = Image.open(trimmed_screenshot_path)
                    img_width, img_height = img.size

                    reader = PdfReader(output_filename)
                    writer = PdfWriter()
                    for p in reader.pages:
                        writer.add_page(p)
                    pdf_page = writer.pages[0]
                    pdf_width = float(pdf_page.mediabox.width)
                    pdf_height = float(pdf_page.mediabox.height)
                    ratio_x = pdf_width / img_width
                    ratio_y = pdf_height / img_height

                    for info in link_infos:
                        bbox = info['bbox']
                        href = info['href']
                        adj_x = (bbox['x'] - offset_left) * ratio_x
                        adj_y_top = (bbox['y'] - offset_top) * ratio_y
                        adj_width = bbox['width'] * ratio_x
                        adj_height = bbox['height'] * ratio_y
                        x_ll = adj_x
                        x_ur = adj_x + adj_width
                        y_ur = pdf_height - adj_y_top
                        y_ll = y_ur - adj_height
                        # Ensure a minimum clickable area (10x10 points) to improve usability.
                        min_side = 10  # points
                        if (x_ur - x_ll) < min_side:
                            expand = (min_side - (x_ur - x_ll)) / 2
                            x_ll -= expand
                            x_ur += expand
                        if (y_ur - y_ll) < min_side:
                            expand = (min_side - (y_ur - y_ll)) / 2
                            y_ll -= expand
                            y_ur += expand
                        # Clamp to page boundaries
                        x_ll = max(0, x_ll)
                        y_ll = max(0, y_ll)
                        x_ur = min(pdf_width, x_ur)
                        y_ur = min(pdf_height, y_ur)
                        rect = [int(x_ll), int(y_ll), int(x_ur), int(y_ur)]
                        logging.info(f"Adding link annotation for {href} with rect {rect} on {url} [Process/Worker]")
                        try:
                            # Prefer built-in helper if available (pypdf >=5)
                            if hasattr(writer, 'add_uri'):
                                # Pass a zero-width border so the clickable rectangle has no visible outline.
                                try:
                                    writer.add_uri(0, href, rect, border=[0, 0, 0])
                                except TypeError:
                                    # Older pypdf versions may not accept the border kwarg – fall back without it.
                                    writer.add_uri(0, href, rect)
                            else:
                                if AnnotationBuilder:
                                    # Newer pypdf: specify border_width=0 so no border is drawn
                                    annot = AnnotationBuilder.uri(rect, href, border_width=0)
                                    writer.add_annotation(0, annot)
                                else:
                                    logging.warning("AnnotationBuilder not available in installed pypdf; URI annotation skipped for this link.")
                        except Exception as uri_err:
                            logging.warning(f"Failed to add URI annotation for {href} on {url}: {uri_err} [Process/Worker]")

                    with open(output_filename, 'wb') as _out:
                        writer.write(_out)
            except Exception as anno_err:
                logging.warning(f"Failed to post-process PDF annotations for {url}: {anno_err} [Process/Worker]")
            
            logging.info(f"Successfully created PDF from screenshot for {url} at {output_filename}. [Process/Worker]")

            # 4. Clean up temporary image files
            try:
                os.remove(raw_screenshot_path)
                if os.path.exists(trimmed_screenshot_path) and trimmed_screenshot_path != raw_screenshot_path:
                    os.remove(trimmed_screenshot_path)
                logging.info(f"Cleaned up temporary screenshots for {url}. [Process/Worker]")
            except Exception as e_cleanup:
                logging.warning(f"Could not clean up temporary screenshots for {url}: {e_cleanup}. [Process/Worker]")

            return output_filename
            # --- End: Screenshot-to-PDF Generation ---
        except Exception as e_ss_pdf:
            logging.error(f"Failed to generate PDF from screenshot for {url}: {e_ss_pdf}", exc_info=True)
            return None


    except Exception as e:
        logging.error(f"Error in capture_and_save_pdf for {url} [Process/Worker]: {e}", exc_info=True)
        # Clean up temp tall PDF if error occurred mid-process
        if 'temp_tall_pdf_path' in locals() and temp_tall_pdf_path and os.path.exists(temp_tall_pdf_path):
            try:
                os.remove(temp_tall_pdf_path)
            except Exception as e_remove_err:
                logging.warning(f"Could not remove temp tall PDF {temp_tall_pdf_path} during error cleanup: {e_remove_err} [Process/Worker]")
        return None

def process_worker_task(storage_state_path: str, url: str, output_dir: str, filename: str):
    """Capture a single URL to PDF inside an isolated worker process.

    Each worker spins up its own Playwright/browser instance (Playwright objects
    cannot be shared across processes), restores the authenticated session from
    the JSON file at *storage_state_path*, captures the page via
    capture_and_save_pdf, and strips a trailing blank page if one was produced.

    Returns:
        Tuple of (url, pdf_path_or_None, filename) so the parent process can
        map results back to the submitted task even when capture fails.
    """
    playwright = browser = context = page = None
    try:
        logging.info(f"Process worker starting for URL: {url}")
        # Fresh Playwright + headless Chromium for this process only.
        playwright = sync_playwright().start()
        browser = playwright.chromium.launch(headless=True)

        # Restore the logged-in session captured once by the parent process.
        try:
            with open(storage_state_path, 'r') as f:
                saved_state = json.load(f)
        except Exception as load_err:
            logging.error(f"Process worker for {url} failed to load storage state from {storage_state_path}: {load_err}")
            raise  # Propagate so this task is reported as failed upstream.

        context = browser.new_context(
            storage_state=saved_state,
            viewport={"width": DEFAULT_VIEWPORT_WIDTH, "height": 1080},  # Use constant
            device_scale_factor=1,
            is_mobile=False,
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        page = context.new_page()

        result_path = capture_and_save_pdf(page, url, output_dir, filename)

        # Screenshot-to-PDF conversion can leave an empty final page; drop it.
        if result_path:
            logging.info(f"Attempting to remove trailing blank page from: {result_path} [Process/Worker]")
            remove_trailing_blank_page(result_path)

        # Echo back URL and filename so the main process can attribute the result.
        return url, result_path, filename
    except Exception as e:
        logging.error(f"Error in process_worker_task for {url}: {e}", exc_info=True)
        return url, None, filename  # None path signals failure for this URL
    finally:
        # Tear down in reverse order of creation; every step is best-effort.
        for action, resource, method in (
            ("closing page", page, "close"),
            ("closing context", context, "close"),
            ("closing browser", browser, "close"),
            ("stopping playwright", playwright, "stop"),
        ):
            if resource:
                try:
                    getattr(resource, method)()
                except Exception as e:
                    logging.debug(f"Error {action} for {url}: {e}")
        logging.info(f"Process worker finished for URL: {url}")

def _df_to_excel_bytes(report_df: pd.DataFrame) -> bytes:
    """Serialize *report_df* to in-memory xlsx bytes (single 'Report' sheet).

    Columns are auto-sized to max(header length, widest cell) plus padding.
    Shared by the early-exit ("no valid URLs") path and the final report path
    so the serialization/sizing logic lives in one place.
    """
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        report_df.to_excel(writer, index=False, sheet_name='Report')
        worksheet = writer.sheets['Report']
        for i, col_name in enumerate(report_df.columns):
            max_data_len = report_df[col_name].astype(str).map(len).max() if not report_df[col_name].empty else 0
            column_width = max(len(str(col_name)), max_data_len) + 2  # +2 padding
            worksheet.set_column(i, i, column_width)
    return output.getvalue()

def main():
    """Streamlit app entry point.

    Workflow: the user uploads an Excel file of URLs (columns B/C/D/F carry
    material type, name, codex and URL respectively), logs into the MSD portal
    once, and each valid URL is captured to PDF in a separate worker process.
    A per-row capture-status report is offered for download as xlsx.
    """
    atexit.register(cleanup_temp_storage_file)  # Remove temp storage-state file on interpreter exit
    storage_state_path = None  # Set after a successful login
    st.title("MSD Portal Screenshot Capture")

    # File upload
    uploaded_file = st.file_uploader("Upload Excel file with URLs", type=['xlsx', 'xls'])

    if uploaded_file is not None:
        try:
            input_df = read_excel(uploaded_file)
            if input_df.empty:
                st.error("Failed to read Excel file or it's empty.")
                return

            # Define 0-indexed column indices for data extraction
            # Col B: Type of Material, Col C: Name, Col D: Codex, Col F: URL
            TYPE_COL_IDX = 1
            NAME_COL_IDX = 2
            CODEX_COL_IDX = 3
            URL_COL_IDX = 5

            # Verify required columns exist by checking indices against df.columns length
            max_required_idx = max(TYPE_COL_IDX, NAME_COL_IDX, CODEX_COL_IDX, URL_COL_IDX)
            if max_required_idx >= len(input_df.columns):
                st.error(f"Excel file is missing required columns. It has {len(input_df.columns)} columns, but needs at least {max_required_idx + 1} for all data (URL, Name, Type, Codex). Please check columns B, C, D, and F.")
                logging.error(f"Excel column check failed. Found columns: {input_df.columns.tolist()}. Required max index: {max_required_idx}")
                return

            input_df['PDF Final Name'] = ""
            input_df['Capture Status'] = "Pending"

            # Build the task list, recording per-row problems without halting the batch.
            tasks_to_submit = []
            for index, row in input_df.iterrows():
                try:
                    # Safely access data using iloc for position-based access
                    type_of_material = str(row.iloc[TYPE_COL_IDX]) if pd.notna(row.iloc[TYPE_COL_IDX]) else ""
                    name = str(row.iloc[NAME_COL_IDX]) if pd.notna(row.iloc[NAME_COL_IDX]) else ""
                    codex = str(row.iloc[CODEX_COL_IDX]) if pd.notna(row.iloc[CODEX_COL_IDX]) else ""
                    url = str(row.iloc[URL_COL_IDX]) if pd.notna(row.iloc[URL_COL_IDX]) else ""

                    if not url.strip() or not url.lower().startswith(('http://', 'https://')):
                        logging.warning(f"Skipping row {index + 2} (Excel row number) due to invalid or empty URL: '{url}'")
                        input_df.loc[index, 'PDF Final Name'] = "SKIPPED_INVALID_URL"
                        input_df.loc[index, 'Capture Status'] = "Skipped (Invalid URL)"
                        continue

                    pdf_final_name = generate_custom_filename(type_of_material, name, codex)
                    input_df.loc[index, 'PDF Final Name'] = pdf_final_name
                    tasks_to_submit.append({'url': url, 'filename': pdf_final_name, 'original_index': index})

                except IndexError:
                    err_msg = f"Error processing row {index + 2} in Excel. Not enough columns or incorrect structure."
                    st.warning(err_msg)  # Use warning for individual row errors to not halt all
                    logging.error(f"{err_msg} Available columns in row: {len(row)} based on df structure: {len(input_df.columns)}.")
                    input_df.loc[index, 'PDF Final Name'] = "ERROR_PROCESSING_ROW_STRUCTURE"
                    input_df.loc[index, 'Capture Status'] = "Error"
                    continue
                except Exception as e:
                    err_msg = f"Unexpected error processing row {index + 2} for filename generation: {e}"
                    st.warning(err_msg)
                    logging.error(err_msg, exc_info=True)
                    input_df.loc[index, 'PDF Final Name'] = "ERROR_PROCESSING_ROW_UNKNOWN"
                    input_df.loc[index, 'Capture Status'] = "Error"
                    continue

            urls_to_process_count = len(tasks_to_submit)
            if urls_to_process_count == 0:
                st.info("No valid URLs found to process after initial parsing.")
                # Still provide the modified Excel for download if rows were skipped/errored
                if not input_df.empty:
                    excel_data = _df_to_excel_bytes(input_df)
                    st.download_button(
                        label="Download Initial Report (No Captures Done)",
                        data=excel_data,
                        file_name="screenshot_report_initial.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                    )
                return

            total_urls = urls_to_process_count # This is the count of tasks actually submitted

            # Login credentials
            username = st.text_input("Username")
            password = st.text_input("Password", type="password")
            login_url = "https://profesionales.msd.es/login"

            if st.button("Start Capture"):
                if not username or not password:
                    st.error("Please enter username and password")
                    return

                # Login (once) and get path to storage state file shared by all workers
                storage_state_path = login_to_portal(username, password, login_url)
                if storage_state_path is None:
                    st.error("Login failed. Cannot proceed.")
                    return # Exit if login failed

                # Create a unique timestamped directory for this batch of PDFs
                timestamp = time.strftime("%Y%m%d-%H%M%S")
                pdf_dir = os.path.join("pdf_outputs", timestamp)
                os.makedirs(pdf_dir, exist_ok=True)
                st.success(f"PDFs will be saved in: {os.path.abspath(pdf_dir)}")

                progress_bar = st.progress(0.0)
                status_text = st.empty()
                processed_count = 0
                successful_captures = 0

                # One worker per CPU core, but never more workers than tasks.
                max_processes = min(os.cpu_count() or 1, total_urls)
                logging.info(f"Using {max_processes} worker processes for {total_urls} tasks.")

                with ProcessPoolExecutor(max_workers=max_processes) as executor:
                    # Map each future back to its task so results can be attributed to rows.
                    futures = {
                        executor.submit(process_worker_task, storage_state_path, task['url'], pdf_dir, task['filename']): task
                        for task in tasks_to_submit
                    }

                    status_text.text(f"Submitted {total_urls} URLs to {max_processes} processes. Processing...")

                    for future in as_completed(futures):
                        task_info = futures[future]
                        original_index = task_info['original_index']
                        original_url = task_info['url']
                        submitted_filename = task_info['filename'] # This is the 'PDF Final Name'

                        try:
                            # Result from process_worker_task is (url, pdf_result_path, returned_filename)
                            res_url, pdf_result_path, returned_filename = future.result()

                            if pdf_result_path and returned_filename == submitted_filename:
                                successful_captures += 1
                                input_df.loc[original_index, 'Capture Status'] = 'Success'
                                logging.info(f"Successfully processed URL: {res_url} -> PDF: {pdf_result_path} (Filename: {returned_filename})")
                            elif pdf_result_path and returned_filename != submitted_filename:
                                input_df.loc[original_index, 'Capture Status'] = 'Failed (Filename Mismatch)'
                                logging.warning(f"Worker returned a PDF but filename mismatched for URL: {original_url}. Expected: {submitted_filename}, Got: {returned_filename}, Path: {pdf_result_path}")
                            else: # pdf_result_path is None
                                input_df.loc[original_index, 'Capture Status'] = 'Failed (Worker Returned No PDF)'
                                logging.warning(f"Worker failed to return PDF for URL: {original_url} (Filename: {submitted_filename})")

                        except Exception as exc:
                            logging.error(f"URL {original_url} (Filename: {submitted_filename}) generated an exception in worker process: {exc}", exc_info=True)
                            input_df.loc[original_index, 'Capture Status'] = 'Error (Worker Exception)'
                        finally:
                            # Advance UI progress regardless of individual task outcome.
                            processed_count += 1
                            progress = processed_count / total_urls
                            progress_bar.progress(progress)
                            status_text.text(f"Processed {processed_count}/{total_urls} URLs... Success: {successful_captures}")

                # --- End of ProcessPoolExecutor block ---

                # Final status update
                status_text.text(f"Processing complete. Processed {processed_count}/{total_urls}.")
                st.success(f"Completed! Successfully captured {successful_captures} out of {total_urls} pages")

                # Prepare Excel report for download using the modified input_df
                if not input_df.empty:
                    excel_data = _df_to_excel_bytes(input_df)
                    st.download_button(
                        label="Download Report",
                        data=excel_data,
                        file_name="screenshot_report_final.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                    )
                else:
                    st.info("No data to generate a report.")

        except Exception as e:
            st.error(f"A critical error occurred in the main process: {e}")
            logging.error(f"Application error: {e}", exc_info=True)
        finally:
            # No global playwright/browser objects to clean up here anymore.
            # Temporary storage state file cleanup is handled by the atexit hook.
            logging.info("Main process try-finally block finished.")

if __name__ == "__main__":
    # Need this guard for ProcessPoolExecutor on some platforms (like Windows)
    main()