import streamlit as st
import pandas as pd
from playwright.sync_api import sync_playwright, Page, Browser, Playwright
import time
import logging
import os
from PIL import Image, ImageChops
import img2pdf
import io
import re
from concurrent.futures import ProcessPoolExecutor, as_completed
import json
import tempfile
from pypdf import PdfReader, PdfWriter
try:
    from pypdf.annotations import AnnotationBuilder  # Present in some pypdf versions
except ImportError:  # Fall back when the installed pypdf does not expose it
    AnnotationBuilder = None
import copy
import math
import shutil
import atexit
from openpyxl import load_workbook
from openpyxl.utils.cell import coordinate_from_string, column_index_from_string
from urllib.parse import urljoin

# Setup logging
logging.basicConfig(
    filename='screenshot_capture.log',
    filemode='a',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

# --- Global Constants ---
MAX_PDF_HEIGHT_PX = 14000  # Max height for a single PDF page segment in pixels
MAX_PDF_HEIGHT_PT = MAX_PDF_HEIGHT_PX * 72 / 96  # Convert to points (assuming 96 DPI)
MAX_CAPTURE_HEIGHT_SINGLE_SEGMENT = 13900 # Pages taller than this (in px) are classified as 'long' (currently logging-only)
DEFAULT_VIEWPORT_WIDTH = 1920 # Default viewport width in pixels for PDF capture
DEFAULT_FALLBACK_PAGE_HEIGHT_PX = 1080 # Fallback page height if everything else fails
DEFAULT_BASE_URL = "https://profesionales.msd.es"  # Used to resolve relative paths like '/recursos/...'
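# e.g. urljoin(DEFAULT_BASE_URL, "/recursos/abc") == "https://profesionales.msd.es/recursos/abc"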
# --- End Global Constants ---

def cleanup_temp_storage_file():
    """Clean up the temporary storage state file if it exists."""
    file_path = st.session_state.get('temp_storage_file_to_delete')
    if file_path and os.path.exists(file_path):
        try:
            os.remove(file_path)
            logging.info(f"ATEIXT: Removed temporary storage state file: {file_path}")
            del st.session_state.temp_storage_file_to_delete # Prevent re-attempts
        except OSError as e:
            logging.error(f"ATEIXT: Error removing temporary storage state file {file_path}: {e}")
    elif file_path:
        logging.info(f"ATEIXT: Temporary storage state file {file_path} not found, skipping removal.")

def initialize_playwright():
    """Initialize Playwright and launch browser"""
    playwright = sync_playwright().start()
    # Launch in headed mode for debugging; set headless=True for production
    browser = playwright.chromium.launch(headless=False) 
    logging.info("Initialized Playwright and launched browser")
    return playwright, browser

def login_to_portal(username: str, password: str, login_url: str) -> str | None:
    """Login to the portal using Playwright, save storage state to a temp file, and return the file path."""
    playwright = None
    browser = None
    context = None
    page = None
    try:
        st.info("Attempting login and saving session state...")
        playwright = sync_playwright().start()
        # Use headless=True for the initial login for efficiency, unless debugging login itself
        browser = playwright.chromium.launch(headless=True) 
        context = browser.new_context(
            viewport={"width": 1920, "height": 1080},
            device_scale_factor=1,
            is_mobile=False,
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        page = context.new_page()
        
        st.info("Navigating to login page...")
        page.goto(login_url)
        logging.info("Navigated to login page")
        time.sleep(1)
        try:
            accept_button = page.locator("#onetrust-accept-btn-handler")
            if accept_button.is_visible(timeout=3000):
                accept_button.click()
                logging.info("Accepted cookies during login attempt")
                time.sleep(1)
        except Exception as e:
            logging.info(f"Cookie dialog not found or error during login: {e}")

        st.info("Entering username...")
        page.locator("#capture_signInFull_username").fill(username)
        logging.info("Entered username")
        
        st.info("Proceeding to password step...")
        page.locator("#buttonNext_signInFull").click()
        logging.info("Clicked 'Siguiente' button")
        
        st.info("Entering credentials...")
        username_field_retry = page.locator("#capture_signInFull_signInUsername")
        username_field_retry.wait_for(state='visible', timeout=10000)
        if username_field_retry.is_visible():
            username_field_retry.fill(username)
            logging.info("Re-entered username")
        
        page.locator("#capture_signInFull_currentPassword").fill(password)
        logging.info("Entered password")
        
        st.info("Submitting login...")
        page.get_by_role("button", name="Acceda").click()
        logging.info("Clicked 'Acceda' button")

        st.info("Waiting for login confirmation...")
        page.wait_for_url(lambda url: "login" not in url, timeout=25000)
        logging.info(f"Login successful, redirected to: {page.url}")
        st.success("Login successful! Session state captured.")
        
        # Save storage state to a temporary file
        storage_state = context.storage_state()
        
        # Create a temporary file
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
            json.dump(storage_state, temp_file)
            storage_state_path = temp_file.name
            logging.info(f"Saved storage state to temporary file: {storage_state_path}")
            st.session_state.temp_storage_file_to_delete = storage_state_path # Store for atexit cleanup
        
        return storage_state_path # Return the path

    except Exception as e:
        logging.error(f"An error occurred during login: {e}", exc_info=True)
        st.error(f"Login failed: {e}")
        return None
    finally:
        # Cleanup resources used for login
        if page: page.close()
        if context: context.close()
        if browser: browser.close()
        if playwright: playwright.stop()

def read_excel(file) -> pd.DataFrame:
    """Read all data from Excel file, using the first row as header."""
    try:
        # Use the first row as header, ensure it's treated as such.
        df = pd.read_excel(file, header=0, sheet_name=0)
        logging.info(f"Read Excel file (first sheet). Shape: {df.shape}. Columns: {df.columns.tolist()}")
        return df
    except Exception as e:
        logging.error(f"Error reading Excel file: {e}")
        st.error(f"Error reading Excel file: {e}")
        return pd.DataFrame() # Return empty DataFrame on error to be checked by caller


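# Matches an optional sheet prefix (quoted or bare) plus an A1-style coordinate
# with optional '$' anchors, e.g. "'My Sheet'!$B$2", "Sheet1!A1" or "C10".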
CELL_REF_RE = re.compile(r"^(?:(?:'([^']+)'|([^'!]+))!)?([$]?[A-Za-z]{1,3}[$]?\d+)$")


def _normalize_sheet_name(raw: str) -> str:
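    """Strip surrounding quotes (un-escaping doubled quotes) and any
    '[workbook]' prefix from a sheet reference.

    Illustrative examples: "'My ''X'' sheet'" -> "My 'X' sheet";
    "[Book1.xlsx]Sheet1" -> "Sheet1".
    """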
    raw = raw.strip()
    if raw.startswith("'") and raw.endswith("'"):
        raw = raw[1:-1].replace("''", "'")
    if ']' in raw:
        raw = raw.split(']')[-1]
    return raw


def _resolve_cell_reference(ref: str, wb_raw, wb_values, default_sheet: str):
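    """Resolve an A1-style reference (optionally sheet-qualified) to a cell value.

    Prefers the cached value from the data_only workbook (wb_values); falls back
    to the raw workbook (wb_raw), which may return a formula string instead of a
    computed value. Returns None for unparsable references.
    """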
    ref = ref.strip()
    sheet_name = default_sheet
    coord = ref
    if '!' in ref:
        sheet_part, coord = ref.split('!', 1)
        sheet_name = _normalize_sheet_name(sheet_part)
    coord_clean = coord.replace('$', '').upper()
    try:
        col_letters, row_index = coordinate_from_string(coord_clean)
    except ValueError:
        return None
    col_index = column_index_from_string(col_letters)
    sheet = None
    if wb_values is not None and sheet_name in wb_values.sheetnames:
        sheet = wb_values[sheet_name]
        value = sheet.cell(row=row_index, column=col_index).value
        if value is not None:
            return value
    if wb_raw is not None and sheet_name in wb_raw.sheetnames:
        sheet = wb_raw[sheet_name]
        return sheet.cell(row=row_index, column=col_index).value
    return None


def _evaluate_literal(token: str | None, wb_raw, wb_values, sheet_name: str):
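    """Best-effort evaluator for the small subset of Excel formula syntax seen in
    HYPERLINK() arguments: quoted strings, numbers, TRUE/FALSE, cell references,
    top-level '&' concatenation, and the CONCAT/CONCATENATE, TEXT (format code
    ignored) and IF functions. Unrecognized expressions are returned unchanged.
    """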
    if token is None:
        return None
    token = token.strip()
    if not token:
        return None
    if token.startswith('"') and token.endswith('"'):
        return token[1:-1].replace('""', '"')
    if token.upper() in ('TRUE', 'FALSE'):
        return token.upper()
    if re.fullmatch(r'[+-]?\d+(?:\.\d+)?', token):
        return token
    match = CELL_REF_RE.match(token)
    if match:
        sheet_part = match.group(1) or match.group(2)
        coord = match.group(3)
        ref = f"{sheet_part + '!' if sheet_part else ''}{coord}"
        return _resolve_cell_reference(ref, wb_raw, wb_values, sheet_name)
    parts = _split_concat(token)
    if parts:
        evaluated_parts: list[str] = []
        for piece in parts:
            part_val = _evaluate_literal(piece, wb_raw, wb_values, sheet_name)
            evaluated_parts.append('' if part_val is None else str(part_val))
        return ''.join(evaluated_parts)
    func, args = _parse_function(token)
    if func is None:
        return token
    func_upper = func.upper()
    if func_upper.startswith('_XLFN.'):
        func_upper = func_upper.replace('_XLFN.', '')
    if func_upper in ('CONCAT', 'CONCATENATE', 'XLFN.CONCAT'):
        concatenated: list[str] = []
        for arg in args:
            arg_val = _evaluate_literal(arg, wb_raw, wb_values, sheet_name)
            concatenated.append('' if arg_val is None else str(arg_val))
        return ''.join(concatenated)
    if func_upper == 'TEXT':
        if not args:
            return ''
        value = _evaluate_literal(args[0], wb_raw, wb_values, sheet_name)
        return '' if value is None else str(value)
    if func_upper == 'IF':
        if not args:
            return None
        condition_val = _evaluate_literal(args[0], wb_raw, wb_values, sheet_name)
        truthy: bool
        if condition_val is None:
            truthy = False
        elif isinstance(condition_val, str):
            truthy = condition_val.strip().upper() not in ('', '0', 'FALSE', 'NO')
        else:
            truthy = bool(condition_val)
        branch_expr = args[1] if truthy else (args[2] if len(args) > 2 else None)
        return _evaluate_literal(branch_expr, wb_raw, wb_values, sheet_name)
    return token


def _split_concat(expr: str) -> list[str] | None:
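    """Split an expression on top-level '&' operators, respecting parentheses
    and quoted strings. Returns None if no top-level '&' is present.

    Illustrative example: '"A"&B2' -> ['"A"', 'B2'].
    """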
    parts: list[str] = []
    current: list[str] = []
    depth = 0
    found = False
    i = 0
    while i < len(expr):
        ch = expr[i]
        if ch == '"':
            current.append(ch)
            i += 1
            while i < len(expr):
                c = expr[i]
                current.append(c)
                if c == '"':
                    if i + 1 < len(expr) and expr[i + 1] == '"':
                        current.append('"')
                        i += 2
                        continue
                    i += 1
                    break
                i += 1
            continue
        if ch == '(':
            depth += 1
        elif ch == ')':
            if depth > 0:
                depth -= 1
        elif depth == 0 and ch == '&':
            parts.append(''.join(current).strip())
            current = []
            found = True
            i += 1
            continue
        current.append(ch)
        i += 1
    if current:
        parts.append(''.join(current).strip())
    return parts if found else None


def _parse_function(expr: str) -> tuple[str | None, list[str]]:
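    """Split 'FUNC(arg1, arg2, ...)' into the function name and its top-level
    arguments, honoring nested parentheses and quoted strings. Both ',' and ';'
    act as argument separators. Returns (None, []) when expr contains no '('.

    Illustrative example: 'IF(A1>0,"yes","no")' -> ('IF', ['A1>0', '"yes"', '"no"']).
    """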
    expr = expr.strip()
    if '(' not in expr:
        return None, []
    func_name, rest = expr.split('(', 1)
    func_name = func_name.strip()
    inner = []
    depth = 0
    token = []
    i = 0
    while i < len(rest):
        ch = rest[i]
        if ch == '"':
            token.append(ch)
            i += 1
            while i < len(rest):
                c = rest[i]
                token.append(c)
                if c == '"':
                    if i + 1 < len(rest) and rest[i + 1] == '"':
                        token.append('"')
                        i += 2
                        continue
                    i += 1
                    break
                i += 1
            continue
        if ch == '(':
            depth += 1
        elif ch == ')':
            if depth == 0:
                inner.append(''.join(token).strip())
                return func_name, inner
            depth -= 1
        elif depth == 0 and ch in (',', ';'):
            inner.append(''.join(token).strip())
            token = []
            i += 1
            continue
        token.append(ch)
        i += 1
    if token:
        inner.append(''.join(token).strip())
    # Unbalanced parentheses: salvage whatever arguments were collected.
    return (func_name if inner else None), inner


def _extract_hyperlink_target(cell_raw, wb_raw, wb_values, sheet_name: str):
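    """Return the link target for a cell, trying in order: a real hyperlink
    object on the cell, the first argument of a HYPERLINK() formula, and finally
    a raw string value that already looks like a URL or absolute path.
    """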
    if cell_raw.hyperlink and getattr(cell_raw.hyperlink, 'target', None):
        return cell_raw.hyperlink.target

    raw_value = cell_raw.value
    if isinstance(raw_value, str):
        candidate = raw_value.lstrip('=').strip()
        if candidate:
            func_name, args = _parse_function(candidate)
            if func_name and func_name.upper() == 'HYPERLINK' and args:
                target_expr = args[0]
                evaluated = _evaluate_literal(target_expr, wb_raw, wb_values, sheet_name)
                if evaluated:
                    return evaluated
            # Direct literal fallback
            if candidate.lower().startswith(('http://', 'https://', '/')):
                return candidate

    return None

def generate_custom_filename(type_of_material: str, name: str, codex: str) -> str:
    """Generate a custom PDF filename from Excel data.
       The name is reduced to ASCII alphanumerics, so accented and special
       characters are dropped (e.g. 'Tomás' becomes 'Toms').
       If codex and type_of_material are missing, use only the cleaned name.
       Example: infografia_Flow3Ciclo3Toms_ES-DSM-00864.pdf or just Toms.pdf"""
    cleaned_name = re.sub(r'[^a-zA-Z0-9]', '', str(name))
    
    if not type_of_material.strip() and not codex.strip():
        # Only name available
        if cleaned_name:
            return f"{cleaned_name}.pdf"
        else:
            logging.warning("Name was empty or contained only non-alphanumeric characters. Using 'unnamed.pdf'.")
            return "unnamed.pdf"
    else:
        # Original logic with all parts
        cleaned_type_of_material = str(type_of_material).rstrip('_')
        cleaned_codex = str(codex)
        
        filename_parts = [part for part in [cleaned_type_of_material, cleaned_name, cleaned_codex] if part]
        
        if not filename_parts:
            base_name = "default_filename"
            logging.warning("All parts for filename were empty or cleaned to empty. Using 'default_filename.pdf'.")
        else:
            base_name = "_".join(filename_parts)
        
        # Ensure the final base_name is not empty if parts joined to nothing (e.g. just underscores)
        if not base_name.strip('_'):
            base_name = "fallback_filename"
            logging.warning("Filename parts resulted in an empty or underscore-only name. Using 'fallback_filename.pdf'.")
        
        return f"{base_name}.pdf"

# --- Start: Helper Functions for Screenshot-to-PDF Conversion ---

def trim_image_whitespace(image_path: str, bg_color=(255, 255, 255)):
    """
    Trim whitespace from the bottom and sides of an image.

    Returns
    -------
    Tuple[str, int, int]
        (path_to_trimmed_image, left_offset_px, top_offset_px)
        If no trimming was required or an error occurred, the original image
        path is returned with offsets (0, 0) so that callers can still work
        with consistent return values.
    """
    try:
        img = Image.open(image_path).convert("RGB")
        bg = Image.new("RGB", img.size, bg_color)
        diff = ImageChops.difference(img, bg)
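        # Diffing against a solid background lets getbbox() return the smallest
        # box that contains every non-background pixel.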
        bbox = diff.getbbox()
        if bbox:
            left, upper, right, lower = bbox
            trimmed_img = img.crop(bbox)
            trimmed_path = image_path.replace(".png", "_trimmed.png")
            trimmed_img.save(trimmed_path, "PNG")
            logging.info(f"Saved trimmed image to {trimmed_path}")
            return trimmed_path, left, upper
        else:
            logging.warning(f"Could not find content box in {image_path}. Returning original path without trimming.")
            return image_path, 0, 0
    except Exception as e:
        logging.error(f"Error trimming image {image_path}: {e}", exc_info=True)
        # Fallback to original image with zero offsets so pipeline can proceed
        return image_path, 0, 0

def convert_image_to_pdf(image_path: str, pdf_path: str):
    """
    Converts a single image file to a PDF.
    """
    try:
        with open(pdf_path, "wb") as f:
            f.write(img2pdf.convert(image_path))
        logging.info(f"Successfully converted {image_path} to {pdf_path}")
    except Exception as e:
        logging.error(f"Error converting image {image_path} to PDF: {e}", exc_info=True)
        raise

# --- End: Helper Functions ---

def remove_trailing_blank_page(pdf_path: str):
    """If the last page of the PDF has no text or XObjects, remove it."""
    try:
        reader = PdfReader(pdf_path)
        total_pages = len(reader.pages)
        if total_pages <= 1:
            return  # nothing to trim
        last_page = reader.pages[-1]
        has_text = False
        try:
            text = last_page.extract_text()
            if text and text.strip():
                has_text = True
        except Exception:
            pass
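        # Screenshot-based pages embed their content as image XObjects, so a page
        # with neither extractable text nor XObjects is treated as blank.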
        has_xobjects = "/XObject" in last_page.get("/Resources", {})
        if not has_text and not has_xobjects:
            # Remove the last page
            writer = PdfWriter()
            for i in range(total_pages - 1):
                writer.add_page(reader.pages[i])
            with open(pdf_path, "wb") as f_out:
                writer.write(f_out)
            logging.info(f"Removed trailing blank page from {pdf_path}")
    except Exception as e:
        logging.warning(f"Could not post-process PDF {pdf_path} to remove blank page: {e}")

def capture_and_save_pdf(page: Page, url: str, output_dir: str, filename: str):
    """Capture full page as PDF, segmented into max 14000px high sections using pypdf."""
    try:
        # --- Start: Capture Browser Console Logs ---
        def log_console_message(msg):
            # Log messages, filtering out any potential noise if necessary
            logging.info(f"[Browser Console] {msg.type.upper()}: {msg.text}")
        
        page.on("console", log_console_message)
        # --- End: Capture Browser Console Logs ---

        page.goto(url, wait_until='load', timeout=60000)
        logging.info(f"Navigated to {url}. [Process/Worker]")

        # Set a very large viewport height to encourage full layout rendering
        large_viewport_height = 30000
        page.set_viewport_size({"width": DEFAULT_VIEWPORT_WIDTH, "height": large_viewport_height})
        logging.info(f"Set viewport to {DEFAULT_VIEWPORT_WIDTH}x{large_viewport_height} for {url}. [Process/Worker]")

        # --- Start: Hide Cookie Consent Icon (e.g., OneTrust) ---
        try:
            logging.info(f"Attempting to hide cookie consent icon for {url}. [Process/Worker]")
            # Common selectors for OneTrust. Adjust if necessary.
            selectors_to_hide = [
                '#onetrust-consent-sdk',
                'div[id^="onetrust-banner-sdk"]',
                '.onetrust-pc-dark-filter',
                '#ot-sdk-btn-floating'
            ]
            for selector in selectors_to_hide:
                page.evaluate(f"(selector) => {{ const el = document.querySelector(selector); if (el) el.style.display = 'none'; }}", selector)
            logging.info(f"Executed JS to hide cookie consent icon for {url}. [Process/Worker]")
        except Exception as e_hide_cookie:
            logging.warning(f"Could not hide cookie consent icon for {url}: {e_hide_cookie}. Interactions might be affected. [Process/Worker]")
        # --- End: Hide Cookie Consent Icon ---

        actual_page_height = None  # Initialize to None
        # Defined up front so the post-settling height checks below cannot hit a
        # NameError if the interaction block raises before its own evaluation.
        total_page_height_js = "() => document.documentElement.scrollHeight"
        # --- Start: Click Expandable Sections & Scroll ---
        page.wait_for_timeout(3000)  # Delay to let the page settle before interacting
        try:
            # --- Expand any 'Leer más' expandable content ---
            expand_buttons = page.locator('text=/Leer más/i')
            count = expand_buttons.count()
            if count > 0:
                logging.info(f"Found {count} 'Leer más' elements for {url}. Clicking them. [Process/Worker]")
                for i in range(count):
                    try:
                        button = expand_buttons.nth(i)
                        if button.is_visible():
                            button.click(timeout=5000)  # Increased timeout
                            page.wait_for_timeout(1000)  # Delay after click to allow DOM update
                    except Exception as click_err:
                        logging.warning(f"Could not click 'Leer más' element {i} on {url}: {click_err} [Process/Worker]")

            # --- Robustly expand accordion elements (may load lazily) ---
            try:
                # Wait briefly for any accordion icons to appear
                try:
                    page.wait_for_selector('span.mhh-mcn-v1-accordion-molecule-header__icon-dropdown', timeout=5000)
                except Exception:
                    pass  # Not critical if none appear immediately

                max_rounds = 4  # Repeat to catch accordions that load after earlier ones expand
                for round_idx in range(max_rounds):
                    closed_selector = 'button:has(span.mhh-mcn-v1-accordion-molecule-header__icon-dropdown)[aria-expanded="false"]'
                    closed_buttons = page.locator(closed_selector)
                    closed_count = closed_buttons.count()
                    if closed_count == 0:
                        break  # All accordions are open

                    logging.info(
                        f"Accordion expansion round {round_idx + 1}: "
                        f"{closed_count} closed accordions detected for {url}. [Process/Worker]"
                    )

                    for btn_idx in range(closed_count):
                        try:
                            btn = closed_buttons.nth(btn_idx)
                            if btn.is_visible():
                                btn.click(timeout=5000)
                                page.wait_for_timeout(600)  # Allow DOM to update
                        except Exception as acc_err:
                            logging.warning(
                                f"Could not click accordion element {btn_idx} on {url}: {acc_err} [Process/Worker]"
                            )

                    # Short pause before re-evaluating, to allow lazily loaded accordions to render
                    page.wait_for_timeout(800)

                # Final log of any remaining closed accordions
                remaining_closed = page.locator(
                    'button:has(span.mhh-mcn-v1-accordion-molecule-header__icon-dropdown)[aria-expanded="false"]'
                ).count()
                logging.info(
                    f"Accordion expansion complete for {url}. Remaining closed accordions: {remaining_closed} [Process/Worker]"
                )
            except Exception as accordion_global_err:
                logging.error(
                    f"Error during accordion expansion on {url}: {accordion_global_err} [Process/Worker]"
                )
            
            logging.info(f"Scrolling page to trigger lazy loading for {url}... [Process/Worker]")
            total_page_height_js = """
            () => {
                return document.documentElement.scrollHeight;
            }
            """
            actual_page_height = page.evaluate(total_page_height_js)
            viewport_height = page.viewport_size['height'] if page.viewport_size else 768
            current_scroll = 0
            scroll_increment = viewport_height * 0.8
            max_scroll_iterations = 50
            iterations = 0
            while current_scroll < actual_page_height and iterations < max_scroll_iterations:
                page.mouse.wheel(0, scroll_increment)
                page.wait_for_timeout(500)
                current_scroll += scroll_increment
                new_total_page_height = page.evaluate(total_page_height_js)
                if new_total_page_height > actual_page_height:
                    actual_page_height = new_total_page_height
                elif page.evaluate("window.scrollY + window.innerHeight >= document.body.scrollHeight - 10"):
                    logging.info(f"Attempting to generate PDF with effective height: {actual_page_height}px for {url} [Process/Worker]")
                    break
                iterations += 1
            if iterations >= max_scroll_iterations:
                logging.warning(f"Max scroll iterations reached for {url}. Final height: {actual_page_height} [Process/Worker]")
            logging.info(f"Finished scrolling for {url}. Final evaluated height: {actual_page_height}px [Process/Worker]")

        except Exception as scroll_err:
            logging.error(f"Error during clicking/scrolling for {url}: {scroll_err} [Process/Worker]")
        # --- End: Click Expandable Sections & Scroll ---

        if actual_page_height is None:
            logging.error(f"Could not determine page height for {url}. Aborting PDF capture.")
            return None

        initial_page_height = actual_page_height # Store height before any potential changes
        logging.info(f"Initial page height for {url}: {initial_page_height}px [Process/Worker]")

        # --- All CSS and JS injection has been removed to simplify rendering. ---
        # We will rely on the browser's native print rendering.

        # Determine if the page is 'long' for logging/future conditional logic.
        # This currently has no effect on rendering as CSS/JS injection is disabled.
        is_long_page = initial_page_height > MAX_CAPTURE_HEIGHT_SINGLE_SEGMENT
        logging.info(f"Page classification: {'Long' if is_long_page else 'Short'}. Note: This currently does not alter rendering. [Process/Worker]")

        # Print media emulation has been removed. initial_page_height is already set.
        # final_page_height will be determined after settling actions.

        # --- Settling actions before final height measurement ---
        logging.info(f"Performing final scroll and adding delays before final height measurement for {url}. [Process/Worker]")
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        page.wait_for_timeout(2000) # Wait 2 seconds for explicit rendering time

        try:
            logging.info(f"Waiting for network idle for {url}. [Process/Worker]")
            page.wait_for_load_state('networkidle', timeout=10000) # Wait for network activity to cease
            logging.info(f"Network is idle for {url}. [Process/Worker]")
        except Exception as e_network_idle:
            logging.warning(f"Timeout or error waiting for network idle for {url}: {e_network_idle}. Proceeding with PDF capture. [Process/Worker]")

        # Re-evaluate page height AFTER all settling actions
        final_page_height = page.evaluate(total_page_height_js)
        logging.info(f"Final page height for {url} after all settling actions: {final_page_height}px [Process/Worker]")

        # Log if page height changed significantly from initial measurement
        if abs(final_page_height - initial_page_height) > 50:
            logging.info(f"Page height changed for {url} after settling actions: from {initial_page_height}px to {final_page_height}px. [Process/Worker]")
        
        if final_page_height <= 100: # Check for suspiciously small height
            logging.warning(f"Final page height for {url} is suspiciously small ({final_page_height}px). Using initial height as fallback if larger. [Process/Worker]")
            if initial_page_height > final_page_height: # Only use initial_page_height if it's actually larger
                 final_page_height = initial_page_height
                 logging.info(f"Reverted to initial height {initial_page_height}px due to small final height for {url}. [Process/Worker]")

        # Ensure there's a valid height to use for PDF generation.
        if final_page_height <= 0:
            logging.error(f"Final page height for {url} is zero or negative. Using initial height: {initial_page_height}px. [Process/Worker]")
            final_page_height = initial_page_height # Fallback to initial height
            if final_page_height <= 0: # Still zero, fallback to a default
                final_page_height = DEFAULT_FALLBACK_PAGE_HEIGHT_PX
                logging.error(f"Initial height also invalid. Using default fallback height: {DEFAULT_FALLBACK_PAGE_HEIGHT_PX}px for {url}. [Process/Worker]")

        # --- Start: Multi-tab Component Handling ---
        # Some pages include a multi-tab component (e.g. efficacy/safety tabs). By default only the first tab
        # would be captured. The following block detects such components and iterates through every tab so that
        # each one is captured in a separate PDF (filename appended with _tabN).
        try:
            tab_items_locator = page.locator("ul.mhh-mcn-v1-tabs-panel-nav__list li")
            tab_count = tab_items_locator.count()
        except Exception:
            tab_items_locator = None
            tab_count = 0

        if tab_count and tab_count > 1:
            logging.info(f"Detected multi-tab component with {tab_count} tabs for {url}. Capturing each tab separately. [Process/Worker]")
            base_name, ext = os.path.splitext(filename)
            captured_paths = []

            for i in range(tab_count):
                try:
                    # Click the tab (even tab 1 to ensure active state)
                    tab_el = tab_items_locator.nth(i)
                    try:
                        tab_el.scroll_into_view_if_needed(timeout=3000)
                    except Exception:
                        pass  # not critical
                    tab_click_target = tab_el.locator('a, button').first if tab_el.locator('a, button').count() > 0 else tab_el
                    tab_click_target.click(timeout=5000)
                    page.wait_for_timeout(1000)  # allow content to load
                    # --- Additional expansion and lazy-load handling for this tab ---
                    try:
                        # Expand 'Leer más' expandable content within the tab
                        expand_buttons_tab = page.locator('text=/Leer más/i')
                        for idx_expand in range(expand_buttons_tab.count()):
                            try:
                                btn_tab = expand_buttons_tab.nth(idx_expand)
                                if btn_tab.is_visible():
                                    btn_tab.click(timeout=5000)
                                    page.wait_for_timeout(600)
                            except Exception as ex:
                                logging.warning(f"Could not click 'Leer más' element {idx_expand} on {url} (tab {i+1}): {ex} [Process/Worker]")

                        # Expand accordion dropdowns within the tab
                        max_rounds_tab = 3
                        for round_idx_tab in range(max_rounds_tab):
                            closed_selector_tab = 'button:has(span.mhh-mcn-v1-accordion-molecule-header__icon-dropdown)[aria-expanded="false"]'
                            closed_buttons_tab = page.locator(closed_selector_tab)
                            closed_count_tab = closed_buttons_tab.count()
                            if closed_count_tab == 0:
                                break  # All accordions are open in this tab
                            for btn_idx2 in range(closed_count_tab):
                                try:
                                    btn2 = closed_buttons_tab.nth(btn_idx2)
                                    if btn2.is_visible():
                                        btn2.click(timeout=5000)
                                        page.wait_for_timeout(400)  # Allow DOM update
                                except Exception as ex2:
                                    logging.warning(f"Could not click accordion element {btn_idx2} on {url} (tab {i+1}): {ex2} [Process/Worker]")

                        # Scroll through the tab content to trigger lazy-loaded images
                        viewport_height_tab = page.viewport_size['height'] if page.viewport_size else 768
                        current_scroll_tab = 0
                        scroll_increment_tab = int(viewport_height_tab * 0.8)
                        total_height_tab = page.evaluate("() => document.documentElement.scrollHeight")
                        max_iter_tab = 25
                        iter_tab = 0
                        while current_scroll_tab < total_height_tab and iter_tab < max_iter_tab:
                            page.mouse.wheel(0, scroll_increment_tab)
                            page.wait_for_timeout(400)
                            current_scroll_tab += scroll_increment_tab
                            total_height_tab = page.evaluate("() => document.documentElement.scrollHeight")
                            iter_tab += 1
                        page.wait_for_timeout(800)  # brief settle after scrolling
                    except Exception as tab_handling_err:
                        logging.warning(f"Error expanding/scrolling content on {url} (tab {i+1}): {tab_handling_err} [Process/Worker]")

                    # --- Capture link info BEFORE screenshot for annotation purposes ---
                    link_infos = []
                    try:
                        possible_selectors = ["a:has-text('Ficha Técnica')", "a:has-text('Ficha Tecnica')"]
                        for sel in possible_selectors:
                            elements = page.locator(sel)
                            cnt = elements.count()
                            for lidx in range(cnt):
                                el = elements.nth(lidx)
                                try:
                                    el.scroll_into_view_if_needed(timeout=3000)
                                except Exception:
                                    pass
                                bbox_abs = el.evaluate("el => { const r = el.getBoundingClientRect(); return { x: r.left + window.scrollX, y: r.top + window.scrollY, width: r.width, height: r.height }; }")
                                href = el.get_attribute('href')
                                if bbox_abs and href:
                                    link_infos.append({"bbox": bbox_abs, "href": href})
                        # CIMA links
                        try:
                            elements = page.locator("a[href*='cima.aemps.es']")
                            cnt_href = elements.count()
                            for hidx in range(cnt_href):
                                el = elements.nth(hidx)
                                try:
                                    el.scroll_into_view_if_needed(timeout=3000)
                                except Exception:
                                    pass
                                bbox_abs = el.evaluate("el => { const r = el.getBoundingClientRect(); return { x: r.left + window.scrollX, y: r.top + window.scrollY, width: r.width, height: r.height }; }")
                                href = el.get_attribute('href')
                                if bbox_abs and href:
                                    link_infos.append({"bbox": bbox_abs, "href": href})
                        except Exception as href_err:
                            logging.warning(f"Could not collect CIMA href link info for {url} (tab {i+1}): {href_err} [Process/Worker]")
                    except Exception as link_err:
                        logging.warning(f"Could not collect 'Ficha Técnica' link info for {url} (tab {i+1}): {link_err} [Process/Worker]")

                    variant_filename = f"{base_name}_tab{i+1}{ext}"
                    raw_screenshot_path = os.path.join(output_dir, f"RAW_SCREENSHOT_{variant_filename.replace('.pdf', '.png')}")
                    logging.info(f"Capturing screenshot for tab {i+1} – saving to {raw_screenshot_path}. [Process/Worker]")
                    page.screenshot(path=raw_screenshot_path, full_page=True)

                    # Trim and convert to PDF
                    trimmed_screenshot_path, offset_left, offset_top = trim_image_whitespace(raw_screenshot_path)
                    final_pdf_path = os.path.join(output_dir, variant_filename)
                    convert_image_to_pdf(trimmed_screenshot_path, final_pdf_path)

                    # --- Add annotations if link info present ---
                    try:
                        if link_infos:
                            img = Image.open(trimmed_screenshot_path)
                            img_w, img_h = img.size
                            reader = PdfReader(final_pdf_path)
                            writer = PdfWriter()
                            for p in reader.pages:
                                writer.add_page(p)
                            pdf_page = writer.pages[0]
                            pdf_w = float(pdf_page.mediabox.width)
                            pdf_h = float(pdf_page.mediabox.height)
                            rx = pdf_w / img_w
                            ry = pdf_h / img_h
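                            # Same pixel-to-point mapping as the single-page branch:
                            # subtract the trim offsets, scale by the image-to-page
                            # ratio, then flip the y-axis for PDF coordinates.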
                            for info in link_infos:
                                bbox = info['bbox']
                                href = info['href']
                                adj_x = (bbox['x'] - offset_left) * rx
                                adj_y_top = (bbox['y'] - offset_top) * ry
                                adj_w = bbox['width'] * rx
                                adj_h = bbox['height'] * ry
                                x_ll = adj_x
                                x_ur = adj_x + adj_w
                                y_ur = pdf_h - adj_y_top
                                y_ll = y_ur - adj_h
                                # ensure minimum clickable area 10x10pt
                                min_side = 10
                                if (x_ur - x_ll) < min_side:
                                    expand = (min_side - (x_ur - x_ll)) / 2
                                    x_ll -= expand
                                    x_ur += expand
                                if (y_ur - y_ll) < min_side:
                                    expand = (min_side - (y_ur - y_ll)) / 2
                                    y_ll -= expand
                                    y_ur += expand
                                x_ll = max(0, x_ll); y_ll = max(0, y_ll)
                                x_ur = min(pdf_w, x_ur); y_ur = min(pdf_h, y_ur)
                                rect = [int(x_ll), int(y_ll), int(x_ur), int(y_ur)]
                                try:
                                    if hasattr(writer, 'add_uri'):
                                        try:
                                            writer.add_uri(0, href, rect, border=[0,0,0])
                                        except TypeError:
                                            writer.add_uri(0, href, rect)
                                    else:
                                        if AnnotationBuilder:
                                            annot = AnnotationBuilder.uri(rect, href, border_width=0)
                                            writer.add_annotation(0, annot)
                                except Exception as uri_err:
                                    logging.warning(f"Failed to add URI annotation for {href} on {url} (tab {i+1}): {uri_err} [Process/Worker]")
                            with open(final_pdf_path, 'wb') as _out:
                                writer.write(_out)
                    except Exception as anno_err:
                        logging.warning(f"Failed to post-process PDF annotations for {url} (tab {i+1}): {anno_err} [Process/Worker]")

                    captured_paths.append(final_pdf_path)

                    # Clean temp images
                    try:
                        os.remove(raw_screenshot_path)
                        if os.path.exists(trimmed_screenshot_path) and trimmed_screenshot_path != raw_screenshot_path:
                            os.remove(trimmed_screenshot_path)
                    except Exception:
                        pass

                    logging.info(f"Successfully captured tab {i+1} for {url} as {variant_filename}. [Process/Worker]")
                except Exception as tab_err:
                    logging.warning(f"Failed to capture tab {i+1} for {url}: {tab_err} [Process/Worker]")

            # If at least one tab PDF was created, return the first one's path so upstream logic remains unchanged.
            if captured_paths:
                return captured_paths[0]
        # --- End: Multi-tab Component Handling ---
        
        # --- Start: PDF Generation (Single Full-Page) ---

        output_filename = os.path.join(output_dir, filename)
        try:
            # --- Start: Screenshot-to-PDF Generation ---
            # 1. Take the full-page screenshot
            logging.info(f"Waiting 5 seconds for final image rendering before screenshot for {url}. [Process/Worker]")
            page.wait_for_timeout(5000)  # Added delay for images to load

            # --- Capture 'Ficha Técnica' / 'Ficha Tecnica' link info before screenshot ---
            link_infos = []
            try:
                possible_selectors = ["a:has-text('Ficha Técnica')", "a:has-text('Ficha Tecnica')"]
                for sel in possible_selectors:
                    elements = page.locator(sel)
                    count_links = elements.count()
                    for idx in range(count_links):
                        el = elements.nth(idx)
                        try:
                            # Ensure the element is rendered and measurable
                            el.scroll_into_view_if_needed(timeout=3000)
                        except Exception:
                            pass
                        # Evaluate bounding box in **document** coordinates so later mapping is correct even if page is scrolled
                        bbox_abs = el.evaluate("el => { const r = el.getBoundingClientRect(); return { x: r.left + window.scrollX, y: r.top + window.scrollY, width: r.width, height: r.height }; }")
                        href = el.get_attribute('href')
                        if bbox_abs and href:
                            link_infos.append({"bbox": bbox_abs, "href": href})

                # --- Also capture dose-specific links that point to the CIMA parent site ---
                try:
                    elements = page.locator("a[href*='cima.aemps.es']")
                    count_links_href = elements.count()
                    for idx in range(count_links_href):
                        el = elements.nth(idx)
                        try:
                            el.scroll_into_view_if_needed(timeout=3000)
                        except Exception:
                            pass
                        bbox_abs = el.evaluate("el => { const r = el.getBoundingClientRect(); return { x: r.left + window.scrollX, y: r.top + window.scrollY, width: r.width, height: r.height }; }")
                        href = el.get_attribute('href')
                        if bbox_abs and href:
                            link_infos.append({"bbox": bbox_abs, "href": href})
                except Exception as href_err:
                    logging.warning(f"Could not collect CIMA href link info for {url}: {href_err} [Process/Worker]")
            except Exception as link_err:
                logging.warning(f"Could not collect 'Ficha Técnica' link info for {url}: {link_err} [Process/Worker]")
            raw_screenshot_path = os.path.join(output_dir, f"RAW_SCREENSHOT_{filename.replace('.pdf', '.png')}")
            logging.info(f"Taking full-page screenshot for {url} and saving to {raw_screenshot_path}. [Process/Worker]")
            page.screenshot(path=raw_screenshot_path, full_page=True)

            # 2. Trim the whitespace from the screenshot
            logging.info(f"Trimming whitespace from {raw_screenshot_path}. [Process/Worker]")
            trimmed_screenshot_path, offset_left, offset_top = trim_image_whitespace(raw_screenshot_path)
            if not trimmed_screenshot_path:
                raise Exception("Failed to trim screenshot whitespace.")

            # 3. Convert the trimmed screenshot to PDF
            logging.info(f"Converting trimmed screenshot {trimmed_screenshot_path} to PDF at {output_filename}. [Process/Worker]")
            convert_image_to_pdf(trimmed_screenshot_path, output_filename)

            # --- Add clickable link annotation to PDF if link info available ---
            try:
                if link_infos:
                    img = Image.open(trimmed_screenshot_path)
                    img_width, img_height = img.size

                    reader = PdfReader(output_filename)
                    writer = PdfWriter()
                    for p in reader.pages:
                        writer.add_page(p)
                    pdf_page = writer.pages[0]
                    pdf_width = float(pdf_page.mediabox.width)
                    pdf_height = float(pdf_page.mediabox.height)
                    ratio_x = pdf_width / img_width
                    ratio_y = pdf_height / img_height
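                    # Map document-pixel coordinates to PDF points: subtract the trim
                    # offsets, scale by the image-to-page ratio, then flip the y-axis
                    # (PDF origin is bottom-left; screenshots measure from top-left).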

                    for info in link_infos:
                        bbox = info['bbox']
                        href = info['href']
                        adj_x = (bbox['x'] - offset_left) * ratio_x
                        adj_y_top = (bbox['y'] - offset_top) * ratio_y
                        adj_width = bbox['width'] * ratio_x
                        adj_height = bbox['height'] * ratio_y
                        x_ll = adj_x
                        x_ur = adj_x + adj_width
                        y_ur = pdf_height - adj_y_top
                        y_ll = y_ur - adj_height
                        # Ensure a minimum clickable area (10x10 points) to improve usability.
                        min_side = 10  # points
                        if (x_ur - x_ll) < min_side:
                            expand = (min_side - (x_ur - x_ll)) / 2
                            x_ll -= expand
                            x_ur += expand
                        if (y_ur - y_ll) < min_side:
                            expand = (min_side - (y_ur - y_ll)) / 2
                            y_ll -= expand
                            y_ur += expand
                        # Clamp to page boundaries
                        x_ll = max(0, x_ll)
                        y_ll = max(0, y_ll)
                        x_ur = min(pdf_width, x_ur)
                        y_ur = min(pdf_height, y_ur)
                        rect = [int(x_ll), int(y_ll), int(x_ur), int(y_ur)]
                        logging.info(f"Adding link annotation for {href} with rect {rect} on {url} [Process/Worker]")
                        try:
                            # Prefer the built-in helper when the installed pypdf provides it
                            if hasattr(writer, 'add_uri'):
                                # Pass a zero-width border so the clickable rectangle has no visible outline.
                                try:
                                    writer.add_uri(0, href, rect, border=[0, 0, 0])
                                except TypeError:
                                    # Older pypdf versions may not accept the border kwarg – fall back without it.
                                    writer.add_uri(0, href, rect)
                            else:
                                if AnnotationBuilder:
                                    # Newer pypdf: specify border_width=0 so no border is drawn
                                    annot = AnnotationBuilder.uri(rect, href, border_width=0)
                                    writer.add_annotation(0, annot)
                                else:
                                    logging.warning("AnnotationBuilder not available in installed pypdf; URI annotation skipped for this link.")
                        except Exception as uri_err:
                            logging.warning(f"Failed to add URI annotation for {href} on {url}: {uri_err} [Process/Worker]")

                    with open(output_filename, 'wb') as _out:
                        writer.write(_out)
            except Exception as anno_err:
                logging.warning(f"Failed to post-process PDF annotations for {url}: {anno_err} [Process/Worker]")
            
            logging.info(f"Successfully created PDF from screenshot for {url} at {output_filename}. [Process/Worker]")

            # 4. Clean up temporary image files
            try:
                os.remove(raw_screenshot_path)
                if os.path.exists(trimmed_screenshot_path) and trimmed_screenshot_path != raw_screenshot_path:
                    os.remove(trimmed_screenshot_path)
                logging.info(f"Cleaned up temporary screenshots for {url}. [Process/Worker]")
            except Exception as e_cleanup:
                logging.warning(f"Could not clean up temporary screenshots for {url}: {e_cleanup}. [Process/Worker]")

            return output_filename
            # --- End: Screenshot-to-PDF Generation ---
        except Exception as e_ss_pdf:
            logging.error(f"Failed to generate PDF from screenshot for {url}: {e_ss_pdf}", exc_info=True)
            return None


    except Exception as e:
        logging.error(f"Error in capture_and_save_pdf for {url} [Process/Worker]: {e}", exc_info=True)
        return None

def process_worker_task(storage_state_path: str, url: str, output_dir: str, filename: str):
    """Worker function to run in a separate process. Returns url, path to PDF, and the filename used."""
    playwright = None
    browser = None
    context = None
    page = None
    try:
        logging.info(f"Process worker starting for URL: {url}")
        # Initialize Playwright within the process
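        # Playwright objects cannot be shared across processes, so each worker
        # starts (and later tears down) its own Playwright instance and browser.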
        playwright = sync_playwright().start()
        # Launch browser within the process (headless is generally better for workers)
        browser = playwright.chromium.launch(headless=True)
        
        # Load storage state from the temporary file
        try:
            with open(storage_state_path, 'r') as f:
                storage_state = json.load(f)
        except Exception as load_err:
            logging.error(f"Process worker for {url} failed to load storage state from {storage_state_path}: {load_err}")
            raise # Re-raise the error to signify failure

        # Create context using the loaded state
        context = browser.new_context(
            storage_state=storage_state,
            viewport={"width": DEFAULT_VIEWPORT_WIDTH, "height": 1080}, # Use constant
            device_scale_factor=1,
            is_mobile=False,
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        page = context.new_page()
        
        # Call the capture function
        pdf_result_path = capture_and_save_pdf(page, url, output_dir, filename)
        
        # If PDF was created, attempt to remove trailing blank page
        if pdf_result_path:
            logging.info(f"Attempting to remove trailing blank page from: {pdf_result_path} [Process/Worker]")
            remove_trailing_blank_page(pdf_result_path)
        
        # Return the original URL, the path to the saved PDF (if successful), and the filename used.
        # This helps in mapping results back in the main process.
        return url, pdf_result_path, filename
    except Exception as e:
        logging.error(f"Error in process_worker_task for {url}: {e}", exc_info=True)
        return url, None, filename # Return URL, None for path, and filename for failure tracking
    finally:
        # Ensure cleanup within the process
        if page: 
            try: page.close() 
            except Exception as e: logging.debug(f"Error closing page for {url}: {e}")
        if context: 
            try: context.close() 
            except Exception as e: logging.debug(f"Error closing context for {url}: {e}")
        if browser: 
            try: browser.close() 
            except Exception as e: logging.debug(f"Error closing browser for {url}: {e}")
        if playwright: 
            try: playwright.stop() 
            except Exception as e: logging.debug(f"Error stopping playwright for {url}: {e}")
        logging.info(f"Process worker finished for URL: {url}")

def main():
    atexit.register(cleanup_temp_storage_file)
    st.title("MSD Portal Screenshot Capture")

    uploaded_file = st.file_uploader("Upload Excel file with URLs", type=['xlsx', 'xls'])

    if uploaded_file is not None:
        try:
            file_bytes = uploaded_file.getvalue()
            input_df = read_excel(io.BytesIO(file_bytes))
            if input_df.empty:
                st.error("Failed to read Excel file or it's empty.")
                return

            url_col = None
            name_col = None
            codex_col = None
            type_col = None
            ruta_col = None

            for idx, col in enumerate(input_df.columns):
                col_lower = str(col).lower().strip()
                if col_lower == 'url':
                    url_col = idx
                elif col_lower == 'name':
                    name_col = idx
                elif col_lower in ['codex', 'codex promomats']:
                    codex_col = idx
                elif col_lower in ['type of material', 'type']:
                    type_col = idx
                elif col_lower in ['ruta', 'path', 'url path', 'enlace', 'link']:
                    ruta_col = idx

            if url_col is None:
                st.error("Excel file must have a column named 'URL'")
                logging.error("Missing required column: 'URL'")
                return
            if name_col is None:
                st.error("Excel file must have a column named 'Name'")
                logging.error("Missing required column: 'Name'")
                return

            hyperlinks = {}
            url_col_name = input_df.columns[url_col]
            try:
                wb_raw = load_workbook(io.BytesIO(file_bytes), data_only=False)
                try:
                    wb_values = load_workbook(io.BytesIO(file_bytes), data_only=True)
                except Exception as value_err:
                    wb_values = None
                    logging.warning(f"Could not load workbook with data_only=True: {value_err}")

                ws_raw = wb_raw[wb_raw.sheetnames[0]]
                ws_values = wb_values[wb_values.sheetnames[0]] if wb_values else None
                sheet_name = ws_raw.title

                header_cells = list(ws_raw.iter_rows(min_row=1, max_row=1, values_only=False))[0]
                sheet_url_col_idx = None
                for c_idx, c in enumerate(header_cells, start=1):
                    if str(c.value).strip().lower() == str(url_col_name).strip().lower():
                        sheet_url_col_idx = c_idx
                        break

                if sheet_url_col_idx is None:
                    sheet_url_col_idx = url_col + 1
                    logging.warning(f"Could not locate 'URL' header; falling back to column index {sheet_url_col_idx} (1-based).")

                for row_idx in range(2, len(input_df) + 2):
                    cell_raw = ws_raw.cell(row=row_idx, column=sheet_url_col_idx)
                    target = None
                    try:
                        target = _extract_hyperlink_target(cell_raw, wb_raw, wb_values, sheet_name)
                    except Exception as e_cell:
                        logging.debug(f"Error parsing hyperlink at row {row_idx}: {e_cell}")

                    if not target and ws_values:
                        cell_val = ws_values.cell(row=row_idx, column=sheet_url_col_idx)
                        val_str = str(cell_val.value).strip() if cell_val.value is not None else ''
                        if val_str.lower().startswith(('http://', 'https://', '/')):
                            target = val_str

                    if target:
                        if not target.lower().startswith(('http://', 'https://')):
                            target = urljoin(DEFAULT_BASE_URL, target)
                        hyperlinks[row_idx - 2] = target
                logging.info(f"Total hyperlinks found in first sheet: {len(hyperlinks)}")
            except Exception as e:
                logging.warning(f"Could not extract hyperlinks from Excel file: {e}")

            input_df['PDF Final Name'] = ""
            input_df['Capture Status'] = "Pending"
            input_df['Resolved URL'] = ""

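            # URL resolution precedence per row: (1) the cell's display text,
            # resolved against DEFAULT_BASE_URL if relative; (2) the hyperlink
            # target extracted above; (3) the 'Ruta' column as a last resort.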
            tasks_to_submit = []
            for index, row in input_df.iterrows():
                try:
                    type_of_material = str(row.iloc[type_col]) if type_col is not None and pd.notna(row.iloc[type_col]) else ""
                    name = str(row.iloc[name_col]) if pd.notna(row.iloc[name_col]) else ""
                    codex = str(row.iloc[codex_col]) if codex_col is not None and pd.notna(row.iloc[codex_col]) else ""
                    url = str(row.iloc[url_col]) if pd.notna(row.iloc[url_col]) else ""
                    ruta_val = str(row.iloc[ruta_col]) if ruta_col is not None and pd.notna(row.iloc[ruta_col]) else ""

                    logging.info(f"Row {index}: URL display value: '{url}', Ruta: '{ruta_val}'")

                    if url and not url.lower().startswith(('http://', 'https://')):
                        # Treat any value containing '/' as path-like and resolve it
                        # against the portal base URL.
                        if '/' in url:
                            resolved = urljoin(DEFAULT_BASE_URL, url)
                            if resolved.lower().startswith(('http://', 'https://')):
                                url = resolved
                                logging.info(f"Row {index}: Resolved relative URL from cell to {url}")

                    if (not url or not url.lower().startswith(('http://', 'https://'))) and index in hyperlinks:
                        url = hyperlinks[index]
                        logging.info(f"Row {index}: Using extracted hyperlink target: {url}")

                    if (not url or not url.lower().startswith(('http://', 'https://'))) and ruta_val:
                        candidate = ruta_val.strip()
                        if not candidate.lower().startswith(('http://', 'https://')):
                            candidate = urljoin(DEFAULT_BASE_URL, candidate)
                        if candidate.lower().startswith(('http://', 'https://')):
                            url = candidate
                            logging.info(f"Row {index}: Using Ruta fallback: {url}")

                    if not url.strip() or not url.lower().startswith(('http://', 'https://')):
                        logging.warning(f"Skipping row {index + 2} (Excel row number) due to invalid or empty URL: '{url}'")
                        input_df.loc[index, 'PDF Final Name'] = "SKIPPED_INVALID_URL"
                        input_df.loc[index, 'Capture Status'] = "Skipped (Invalid URL)"
                        input_df.loc[index, 'Resolved URL'] = ""
                        continue

                    pdf_final_name = generate_custom_filename(type_of_material, name, codex)
                    input_df.loc[index, 'PDF Final Name'] = pdf_final_name
                    input_df.loc[index, 'Resolved URL'] = url
                    tasks_to_submit.append({'url': url, 'filename': pdf_final_name, 'original_index': index})

                except IndexError:
                    err_msg = f"Error processing row {index + 2} in Excel. Column index out of range for available columns."
                    st.warning(err_msg)
                    logging.error(f"{err_msg} Available columns: {input_df.columns.tolist()}.")
                    input_df.loc[index, 'PDF Final Name'] = "ERROR_PROCESSING_ROW_STRUCTURE"
                    input_df.loc[index, 'Capture Status'] = "Error"
                    continue
                except Exception as e:
                    err_msg = f"Unexpected error processing row {index + 2} for filename generation: {e}"
                    st.warning(err_msg)
                    logging.error(err_msg, exc_info=True)
                    input_df.loc[index, 'PDF Final Name'] = "ERROR_PROCESSING_ROW_UNKNOWN"
                    input_df.loc[index, 'Capture Status'] = "Error"
                    continue

            def _report_to_excel_bytes(df: pd.DataFrame) -> bytes:
                """Serialize the report DataFrame to .xlsx bytes with auto-sized columns."""
                buffer = io.BytesIO()
                with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
                    df.to_excel(writer, index=False, sheet_name='Report')
                    worksheet = writer.sheets['Report']
                    for i, col_name in enumerate(df.columns):
                        max_data_len = df[col_name].astype(str).map(len).max() if not df[col_name].empty else 0
                        worksheet.set_column(i, i, max(len(str(col_name)), max_data_len) + 2)
                return buffer.getvalue()

            urls_to_process_count = len(tasks_to_submit)
            if urls_to_process_count == 0:
                st.info("No valid URLs found to process after initial parsing.")
                if not input_df.empty:
                    st.download_button(
                        label="Download Initial Report (No Captures Done)",
                        data=_report_to_excel_bytes(input_df),
                        file_name="screenshot_report_initial.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                    )
                return

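            # Streamlit re-runs this script on every widget interaction, so the
            # credential inputs and the capture button only render once the
            # uploaded file has parsed successfully.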
            total_urls = urls_to_process_count
            username = st.text_input("Username")
            password = st.text_input("Password", type="password")
            login_url = "https://profesionales.msd.es/login"

            if st.button("Start Capture"):
                if not username or not password:
                    st.error("Please enter username and password")
                    return

                storage_state_path = login_to_portal(username, password, login_url)
                if storage_state_path is None:
                    st.error("Login failed. Cannot proceed.")
                    return

                timestamp = time.strftime("%Y%m%d-%H%M%S")
                pdf_dir = os.path.join("pdf_outputs", timestamp)
                os.makedirs(pdf_dir, exist_ok=True)
                st.success(f"PDFs will be saved in: {os.path.abspath(pdf_dir)}")

                progress_bar = st.progress(0.0)
                status_text = st.empty()
                processed_count = 0
                successful_captures = 0

                max_processes = min(os.cpu_count() or 1, total_urls)
                logging.info(f"Using {max_processes} worker processes for {total_urls} tasks.")

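                # One URL per worker process: Playwright's sync API cannot be
                # shared across processes, so each worker (process_worker_task,
                # defined earlier) is expected to launch its own browser and
                # rebuild an authenticated context from storage_state_path.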
                with ProcessPoolExecutor(max_workers=max_processes) as executor:
                    futures = {
                        executor.submit(process_worker_task, storage_state_path, task['url'], pdf_dir, task['filename']): task
                        for task in tasks_to_submit
                    }

                    status_text.text(f"Submitted {total_urls} URLs to {max_processes} processes. Processing...")

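                    # as_completed yields futures in completion order, so progress
                    # updates are independent of submission order.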
                    for future in as_completed(futures):
                        task_info = futures[future]
                        original_index = task_info['original_index']
                        original_url = task_info['url']
                        submitted_filename = task_info['filename']

                        try:
                            res_url, pdf_result_path, returned_filename = future.result()

                            if pdf_result_path and returned_filename == submitted_filename:
                                successful_captures += 1
                                input_df.loc[original_index, 'Capture Status'] = 'Success'
                                logging.info(f"Successfully processed URL: {res_url} -> PDF: {pdf_result_path} (Filename: {returned_filename})")
                            elif pdf_result_path and returned_filename != submitted_filename:
                                input_df.loc[original_index, 'Capture Status'] = 'Failed (Filename Mismatch)'
                                logging.warning(f"Worker returned a PDF but filename mismatched for URL: {original_url}. Expected: {submitted_filename}, Got: {returned_filename}")
                            else:
                                input_df.loc[original_index, 'Capture Status'] = 'Failed (Worker Returned No PDF)'
                                logging.warning(f"Worker failed to return PDF for URL: {original_url} (Filename: {submitted_filename})")

                        except Exception as exc:
                            logging.error(f"URL {original_url} (Filename: {submitted_filename}) generated an exception in worker process: {exc}", exc_info=True)
                            input_df.loc[original_index, 'Capture Status'] = 'Error (Worker Exception)'
                        finally:
                            processed_count += 1
                            progress = processed_count / total_urls
                            progress_bar.progress(progress)
                            status_text.text(f"Processed {processed_count}/{total_urls} URLs... Success: {successful_captures}")

                status_text.text(f"Processing complete. Processed {processed_count}/{total_urls}.")
                st.success(f"Completed! Successfully captured {successful_captures} out of {total_urls} pages")

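                # Final per-row report: PDF name, capture status and resolved URL
                # for every input row, including skipped and failed ones.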
                if not input_df.empty:
                    st.download_button(
                        label="Download Report",
                        data=_report_to_excel_bytes(input_df),
                        file_name="screenshot_report_final.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                    )
                else:
                    st.info("No data to generate a report.")

        except Exception as e:
            st.error(f"A critical error occurred in the main process: {e}")
            logging.error(f"Application error: {e}", exc_info=True)
        finally:
            logging.info("Main process try-finally block finished.")

if __name__ == "__main__":
    # The __main__ guard is required because ProcessPoolExecutor spawns worker
    # processes on some platforms (Windows, and macOS by default); workers
    # re-import this module, and the guard stops them from re-running main().
    main()
