import streamlit as st
import pandas as pd
from playwright.sync_api import sync_playwright, Page
import time
import logging
import os
from PIL import Image, ImageChops
import img2pdf
import io
import re
from concurrent.futures import ProcessPoolExecutor, as_completed
import json
import tempfile
from pypdf import PdfReader, PdfWriter
import atexit

# Setup logging
logging.basicConfig(
    filename='screenshot_capture.log',
    filemode='a',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

# --- Global Constants ---
MAX_PDF_HEIGHT_PX = 14000  # Max height for a single PDF page segment in pixels
MAX_PDF_HEIGHT_PT = MAX_PDF_HEIGHT_PX * 72 / 96  # Convert to points (assuming 96 DPI)
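#   e.g. 14000 px at 96 DPI -> 14000 * 72 / 96 = 10500 pt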
MAX_CAPTURE_HEIGHT_SINGLE_SEGMENT = 13900 # Pages taller than this (in px) are classified as 'long' (currently log-only; see capture_and_save_pdf)
DEFAULT_VIEWPORT_WIDTH = 1920 # Default viewport width in pixels for PDF capture
DEFAULT_FALLBACK_PAGE_HEIGHT_PX = 1080 # Fallback page height if everything else fails
# --- End Global Constants ---

def cleanup_temp_storage_file():
    """Clean up the temporary storage state file if it exists."""
    file_path = st.session_state.get('temp_storage_file_to_delete')
    if file_path and os.path.exists(file_path):
        try:
            os.remove(file_path)
            logging.info(f"ATEIXT: Removed temporary storage state file: {file_path}")
            del st.session_state.temp_storage_file_to_delete # Prevent re-attempts
        except OSError as e:
            logging.error(f"ATEIXT: Error removing temporary storage state file {file_path}: {e}")
    elif file_path:
        logging.info(f"ATEIXT: Temporary storage state file {file_path} not found, skipping removal.")

def initialize_playwright():
    """Initialize Playwright and launch browser"""
    playwright = sync_playwright().start()
    # Launch in headed mode for debugging; set headless=True for production use
    browser = playwright.chromium.launch(headless=False)
    logging.info("Initialized Playwright and launched browser")
    return playwright, browser

def login_to_portal(username: str, password: str, login_url: str) -> str | None:
    """Login to the portal using Playwright, save storage state to a temp file, and return the file path."""
    playwright = None
    browser = None
    context = None
    page = None
    try:
        st.info("Attempting login and saving session state...")
        playwright = sync_playwright().start()
        # Use headless=True for the initial login for efficiency, unless debugging login itself
        browser = playwright.chromium.launch(headless=True) 
        context = browser.new_context(
            viewport={"width": 1920, "height": 1080},
            device_scale_factor=1,
            is_mobile=False,
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        page = context.new_page()
        
        st.info("Navigating to login page...")
        page.goto(login_url)
        logging.info("Navigated to login page")
        time.sleep(1)
        try:
            # Locator.is_visible() ignores its timeout argument (it returns
            # immediately), so explicitly wait for the banner instead; the
            # except below covers the no-banner case.
            accept_button = page.locator("#onetrust-accept-btn-handler")
            accept_button.wait_for(state="visible", timeout=3000)
            accept_button.click()
            logging.info("Accepted cookies during login attempt")
            time.sleep(1)
        except Exception as e:
            logging.info(f"Cookie dialog not found or error during login: {e}")

        st.info("Entering username...")
        page.locator("#capture_signInFull_username").fill(username)
        logging.info("Entered username")
        
        st.info("Proceeding to password step...")
        page.locator("#buttonNext_signInFull").click()
        logging.info("Clicked 'Siguiente' button")
        
        st.info("Entering credentials...")
        username_field_retry = page.locator("#capture_signInFull_signInUsername")
        # wait_for raises on timeout, so no separate visibility check is needed
        username_field_retry.wait_for(state='visible', timeout=10000)
        username_field_retry.fill(username)
        logging.info("Re-entered username")
        
        page.locator("#capture_signInFull_currentPassword").fill(password)
        logging.info("Entered password")
        
        st.info("Submitting login...")
        page.get_by_role("button", name="Acceda").click()
        logging.info("Clicked 'Acceda' button")

        st.info("Waiting for login confirmation...")
        page.wait_for_url(lambda url: "login" not in url, timeout=25000)
        logging.info(f"Login successful, redirected to: {page.url}")
        st.success("Login successful! Session state captured.")
        
        # Save storage state to a temporary file
        storage_state = context.storage_state()
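        # The storage_state JSON (cookies + local storage) lets each worker
        # process recreate an authenticated browser context without repeating
        # the login flow; see process_worker_task() below.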
        
        # Create a temporary file
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
            json.dump(storage_state, temp_file)
            storage_state_path = temp_file.name
            logging.info(f"Saved storage state to temporary file: {storage_state_path}")
            st.session_state.temp_storage_file_to_delete = storage_state_path # Store for atexit cleanup
        
        return storage_state_path # Return the path

    except Exception as e:
        logging.error(f"An error occurred during login: {e}", exc_info=True)
        st.error(f"Login failed: {e}")
        return None
    finally:
        # Cleanup resources used for login
        if page: page.close()
        if context: context.close()
        if browser: browser.close()
        if playwright: playwright.stop()

def read_excel(file) -> pd.DataFrame:
    """Read all data from Excel file, using the first row as header."""
    try:
        # Use the first row as header, ensure it's treated as such.
        df = pd.read_excel(file, header=0)
        logging.info(f"Read Excel file. Shape: {df.shape}. Columns: {df.columns.tolist()}")
        return df
    except Exception as e:
        logging.error(f"Error reading Excel file: {e}")
        st.error(f"Error reading Excel file: {e}")
        return pd.DataFrame() # Return empty DataFrame on error to be checked by caller

def generate_custom_filename(type_of_material: str, name: str, codex: str) -> str:
    """Generate a custom PDF filename from Excel data.
       Example: infografia_Flow3Ciclo3Toms_ES-DSM-00864.pdf
       (the cleaning regex keeps ASCII letters/digits only, so accented
       characters such as 'á' are dropped)."""
    # Clean type_of_material: remove trailing '_'
    cleaned_type_of_material = str(type_of_material).rstrip('_')
    
    # Clean name: keep ASCII letters and digits only (drops spaces, punctuation,
    # and accented characters)
    cleaned_name = re.sub(r'[^a-zA-Z0-9]', '', str(name))
    
    # Codex is used as is, ensure it's a string
    cleaned_codex = str(codex)
    
    # Concatenate parts, filtering out any empty parts that might result from cleaning or empty inputs
    filename_parts = [part for part in [cleaned_type_of_material, cleaned_name, cleaned_codex] if part]
    
    if not filename_parts: # If all parts are empty
        base_name = "default_filename"
        logging.warning("All parts for filename were empty or cleaned to empty. Using 'default_filename.pdf'.")
    else:
        base_name = "_".join(filename_parts)
    
    # Ensure the final base_name is not empty if parts joined to nothing (e.g. just underscores)
    if not base_name.strip('_'):
        base_name = "fallback_filename"
        logging.warning("Filename parts resulted in an empty or underscore-only name. Using 'fallback_filename.pdf'.")

    return f"{base_name}.pdf"

# --- Start: Helper Functions for Screenshot-to-PDF Conversion ---

def trim_image_whitespace(image_path: str, bg_color=(255, 255, 255)) -> str | None:
    """
    Trims the uniform background colour (whitespace) from all sides of an image.
    Returns the path to the trimmed image, the original path if no content box
    was found, or None on error.
    """
    try:
        img = Image.open(image_path).convert("RGB")
        bg = Image.new("RGB", img.size, bg_color)
        diff = ImageChops.difference(img, bg)
        bbox = diff.getbbox()
        if bbox:
            trimmed_img = img.crop(bbox)
            trimmed_path = image_path.replace(".png", "_trimmed.png")
            trimmed_img.save(trimmed_path, "PNG")
            logging.info(f"Saved trimmed image to {trimmed_path}")
            return trimmed_path
        else:
            logging.warning(f"Could not find content box in {image_path}. Returning original path.")
            return image_path
    except Exception as e:
        logging.error(f"Error trimming image {image_path}: {e}", exc_info=True)
        return None

def convert_image_to_pdf(image_path: str, pdf_path: str):
    """
    Converts a single image file to a PDF.
    """
    try:
        with open(pdf_path, "wb") as f:
            f.write(img2pdf.convert(image_path))
        logging.info(f"Successfully converted {image_path} to {pdf_path}")
    except Exception as e:
        logging.error(f"Error converting image {image_path} to PDF: {e}", exc_info=True)
        raise

# --- End: Helper Functions ---

def remove_trailing_blank_page(pdf_path: str):
    """If the last page of the PDF has no text or XObjects, remove it."""
    try:
        reader = PdfReader(pdf_path)
        total_pages = len(reader.pages)
        if total_pages <= 1:
            return  # nothing to trim
        last_page = reader.pages[-1]
        has_text = False
        try:
            text = last_page.extract_text()
            if text and text.strip():
                has_text = True
        except Exception:
            pass
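        # Images and vector content are referenced as XObjects in the page's
        # /Resources dictionary; a page with neither extractable text nor
        # XObjects is treated as blank.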
        has_xobjects = "/XObject" in last_page.get("/Resources", {})
        if not has_text and not has_xobjects:
            # Remove the last page
            writer = PdfWriter()
            for i in range(total_pages - 1):
                writer.add_page(reader.pages[i])
            with open(pdf_path, "wb") as f_out:
                writer.write(f_out)
            logging.info(f"Removed trailing blank page from {pdf_path}")
    except Exception as e:
        logging.warning(f"Could not post-process PDF {pdf_path} to remove blank page: {e}")

def capture_and_save_pdf(page: Page, url: str, output_dir: str, filename: str):
    """Capture full page as PDF, segmented into max 14000px high sections using pypdf."""
    try:
        # --- Start: Capture Browser Console Logs ---
        def log_console_message(msg):
            # Log messages, filtering out any potential noise if necessary
            logging.info(f"[Browser Console] {msg.type.upper()}: {msg.text}")
        
        page.on("console", log_console_message)
        # --- End: Capture Browser Console Logs ---

        page.goto(url, wait_until='load', timeout=60000)
        logging.info(f"Navigated to {url}. [Process/Worker]")

        # Set a very large viewport height to encourage full layout rendering
        large_viewport_height = 30000
        page.set_viewport_size({"width": DEFAULT_VIEWPORT_WIDTH, "height": large_viewport_height})
        logging.info(f"Set viewport to {DEFAULT_VIEWPORT_WIDTH}x{large_viewport_height} for {url}. [Process/Worker]")

        # --- Start: Hide Cookie Consent Icon (e.g., OneTrust) ---
        try:
            logging.info(f"Attempting to hide cookie consent icon for {url}. [Process/Worker]")
            # Common selectors for OneTrust. Adjust if necessary.
            selectors_to_hide = [
                '#onetrust-consent-sdk',
                'div[id^="onetrust-banner-sdk"]',
                '.onetrust-pc-dark-filter',
                '#ot-sdk-btn-floating'
            ]
            for selector in selectors_to_hide:
                page.evaluate(f"(selector) => {{ const el = document.querySelector(selector); if (el) el.style.display = 'none'; }}", selector)
            logging.info(f"Executed JS to hide cookie consent icon for {url}. [Process/Worker]")
        except Exception as e_hide_cookie:
            logging.warning(f"Could not hide cookie consent icon for {url}: {e_hide_cookie}. Interactions might be affected. [Process/Worker]")
        # --- End: Hide Cookie Consent Icon ---

        actual_page_height = None # Initialize to None
        # JS snippet reused for every page-height measurement below
        total_page_height_js = "() => document.documentElement.scrollHeight"
        # --- Start: Click Expandable Sections & Scroll ---
        page.wait_for_timeout(3000) # Increased delay for page to settle before interacting
        try:
            expand_buttons = page.locator('text=/Leer más/i')
            count = expand_buttons.count()
            if count > 0:
                logging.info(f"Found {count} 'Leer más' elements for {url}. Clicking them. [Process/Worker]")
                for i in range(count):
                    try:
                        button = expand_buttons.nth(i)
                        if button.is_visible():
                            button.click(timeout=5000) # Increased timeout
                            page.wait_for_timeout(1000) # Increased delay after click
                    except Exception as click_err:
                        logging.warning(f"Could not click 'Leer más' element {i} on {url}: {click_err} [Process/Worker]")
            
            logging.info(f"Scrolling page to trigger lazy loading for {url}... [Process/Worker]")
            total_page_height_js = """
            () => {
                return document.documentElement.scrollHeight;
            }
            """
            actual_page_height = page.evaluate(total_page_height_js)
            viewport_height = page.viewport_size['height'] if page.viewport_size else 768
            current_scroll = 0
            scroll_increment = viewport_height * 0.8
            max_scroll_iterations = 50
            iterations = 0
            while current_scroll < actual_page_height and iterations < max_scroll_iterations:
                page.mouse.wheel(0, scroll_increment)
                page.wait_for_timeout(500)
                current_scroll += scroll_increment
                new_total_page_height = page.evaluate(total_page_height_js)
                if new_total_page_height > actual_page_height:
                    actual_page_height = new_total_page_height
                elif page.evaluate("window.scrollY + window.innerHeight >= document.body.scrollHeight - 10"):
                    logging.info(f"Attempting to generate PDF with effective height: {actual_page_height}px for {url} [Process/Worker]")
                    break
                iterations += 1
            if iterations >= max_scroll_iterations:
                logging.warning(f"Max scroll iterations reached for {url}. Final height: {actual_page_height} [Process/Worker]")
            logging.info(f"Finished scrolling for {url}. Final evaluated height: {actual_page_height}px [Process/Worker]")

        except Exception as scroll_err:
            logging.error(f"Error during clicking/scrolling for {url}: {scroll_err} [Process/Worker]")
        # --- End: Click Expandable Sections & Scroll ---

        if actual_page_height is None:
            logging.error(f"Could not determine page height for {url}. Aborting PDF capture.")
            return None

        initial_page_height = actual_page_height # Store height before any potential changes
        logging.info(f"Initial page height for {url}: {initial_page_height}px [Process/Worker]")

        # --- All CSS and JS injection has been removed to simplify rendering. ---
        # We will rely on the browser's native print rendering.

        # Determine if the page is 'long' for logging/future conditional logic.
        # This currently has no effect on rendering as CSS/JS injection is disabled.
        is_long_page = initial_page_height > MAX_CAPTURE_HEIGHT_SINGLE_SEGMENT
        logging.info(f"Page classification: {'Long' if is_long_page else 'Short'}. Note: This currently does not alter rendering. [Process/Worker]")

        # Print media emulation has been removed. initial_page_height is already set.
        # final_page_height will be determined after settling actions.

        # --- Settling actions before final height measurement ---
        logging.info(f"Performing final scroll and adding delays before final height measurement for {url}. [Process/Worker]")
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        page.wait_for_timeout(2000) # Wait 2 seconds for explicit rendering time

        try:
            logging.info(f"Waiting for network idle for {url}. [Process/Worker]")
            page.wait_for_load_state('networkidle', timeout=10000) # Wait for network activity to cease
            logging.info(f"Network is idle for {url}. [Process/Worker]")
        except Exception as e_network_idle:
            logging.warning(f"Timeout or error waiting for network idle for {url}: {e_network_idle}. Proceeding with PDF capture. [Process/Worker]")

        # Re-evaluate page height AFTER all settling actions
        final_page_height = page.evaluate(total_page_height_js)
        logging.info(f"Final page height for {url} after all settling actions: {final_page_height}px [Process/Worker]")

        # Log if page height changed significantly from initial measurement
        if abs(final_page_height - initial_page_height) > 50:
            logging.info(f"Page height changed for {url} after settling actions: from {initial_page_height}px to {final_page_height}px. [Process/Worker]")
        
        if final_page_height <= 100: # Check for suspiciously small height
            logging.warning(f"Final page height for {url} is suspiciously small ({final_page_height}px). Using initial height as fallback if larger. [Process/Worker]")
            if initial_page_height > final_page_height: # Only use initial_page_height if it's actually larger
                 final_page_height = initial_page_height
                 logging.info(f"Reverted to initial height {initial_page_height}px due to small final height for {url}. [Process/Worker]")

        # Ensure there's a valid height to use for PDF generation.
        if final_page_height <= 0:
            logging.error(f"Final page height for {url} is zero or negative. Using initial height: {initial_page_height}px. [Process/Worker]")
            final_page_height = initial_page_height # Fallback to initial height
            if final_page_height <= 0: # Still zero, fallback to a default
                final_page_height = DEFAULT_FALLBACK_PAGE_HEIGHT_PX
                logging.error(f"Initial height also invalid. Using default fallback height: {DEFAULT_FALLBACK_PAGE_HEIGHT_PX}px for {url}. [Process/Worker]")
        
        # --- Start: PDF Generation (Single Full-Page) ---

        output_filename = os.path.join(output_dir, filename)
        try:
            # --- Start: Screenshot-to-PDF Generation ---
            # 1. Take the full-page screenshot
            logging.info(f"Waiting 5 seconds for final image rendering before screenshot for {url}. [Process/Worker]")
            page.wait_for_timeout(5000) # Added delay for images to load
            raw_screenshot_path = os.path.join(output_dir, f"RAW_SCREENSHOT_{filename.replace('.pdf', '.png')}")
            logging.info(f"Taking full-page screenshot for {url} and saving to {raw_screenshot_path}. [Process/Worker]")
            page.screenshot(path=raw_screenshot_path, full_page=True)
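            # Caveat: very tall pages can exceed Chromium's texture-size limit
            # (roughly 16,384 px) and come back clipped or blank in some browser
            # versions; segmented capture would be needed in that case.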

            # 2. Trim the whitespace from the screenshot
            logging.info(f"Trimming whitespace from {raw_screenshot_path}. [Process/Worker]")
            trimmed_screenshot_path = trim_image_whitespace(raw_screenshot_path)
            if not trimmed_screenshot_path:
                raise Exception("Failed to trim screenshot whitespace.")

            # 3. Convert the trimmed screenshot to PDF
            logging.info(f"Converting trimmed screenshot {trimmed_screenshot_path} to PDF at {output_filename}. [Process/Worker]")
            convert_image_to_pdf(trimmed_screenshot_path, output_filename)
            
            logging.info(f"Successfully created PDF from screenshot for {url} at {output_filename}. [Process/Worker]")

            # 4. Clean up temporary image files
            try:
                os.remove(raw_screenshot_path)
                if os.path.exists(trimmed_screenshot_path) and trimmed_screenshot_path != raw_screenshot_path:
                    os.remove(trimmed_screenshot_path)
                logging.info(f"Cleaned up temporary screenshots for {url}. [Process/Worker]")
            except Exception as e_cleanup:
                logging.warning(f"Could not clean up temporary screenshots for {url}: {e_cleanup}. [Process/Worker]")

            return output_filename
            # --- End: Screenshot-to-PDF Generation ---
        except Exception as e_ss_pdf:
            logging.error(f"Failed to generate PDF from screenshot for {url}: {e_ss_pdf}", exc_info=True)
            return None


    except Exception as e:
        logging.error(f"Error in capture_and_save_pdf for {url} [Process/Worker]: {e}", exc_info=True)
        return None

def process_worker_task(storage_state_path: str, url: str, output_dir: str, filename: str):
    """Worker function to run in a separate process. Returns url, path to PDF, and the filename used."""
    playwright = None
    browser = None
    context = None
    page = None
    try:
        logging.info(f"Process worker starting for URL: {url}")
        # Initialize Playwright within the process
        playwright = sync_playwright().start()
        # Launch browser within the process (headless is generally better for workers)
        browser = playwright.chromium.launch(headless=True)
        
        # Load storage state from the temporary file
        try:
            with open(storage_state_path, 'r') as f:
                storage_state = json.load(f)
        except Exception as load_err:
            logging.error(f"Process worker for {url} failed to load storage state from {storage_state_path}: {load_err}")
            raise # Re-raise the error to signify failure

        # Create context using the loaded state
        context = browser.new_context(
            storage_state=storage_state,
            viewport={"width": DEFAULT_VIEWPORT_WIDTH, "height": 1080}, # Use constant
            device_scale_factor=1,
            is_mobile=False,
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        page = context.new_page()
        
        # Call the capture function
        pdf_result_path = capture_and_save_pdf(page, url, output_dir, filename)
        
        # If PDF was created, attempt to remove trailing blank page
        if pdf_result_path:
            logging.info(f"Attempting to remove trailing blank page from: {pdf_result_path} [Process/Worker]")
            remove_trailing_blank_page(pdf_result_path)
        
        # Return the original URL, the path to the saved PDF (if successful), and the filename used.
        # This helps in mapping results back in the main process.
        return url, pdf_result_path, filename
    except Exception as e:
        logging.error(f"Error in process_worker_task for {url}: {e}", exc_info=True)
        return url, None, filename # Return URL, None for path, and filename for failure tracking
    finally:
        # Ensure cleanup within the process
        if page:
            try:
                page.close()
            except Exception as e:
                logging.debug(f"Error closing page for {url}: {e}")
        if context:
            try:
                context.close()
            except Exception as e:
                logging.debug(f"Error closing context for {url}: {e}")
        if browser:
            try:
                browser.close()
            except Exception as e:
                logging.debug(f"Error closing browser for {url}: {e}")
        if playwright:
            try:
                playwright.stop()
            except Exception as e:
                logging.debug(f"Error stopping playwright for {url}: {e}")
        logging.info(f"Process worker finished for URL: {url}")

def main():
    atexit.register(cleanup_temp_storage_file) # Register cleanup function
    storage_state_path = None  # Initialize to None
    st.title("MSD Portal Screenshot Capture")
    
    # File upload
    uploaded_file = st.file_uploader("Upload Excel file with URLs", type=['xlsx', 'xls'])
    
    if uploaded_file is not None:
        try:
            input_df = read_excel(uploaded_file)
            if input_df.empty:
                st.error("Failed to read Excel file or it's empty.")
                return

            # Define 0-indexed column indices for data extraction
            # Col B: Type of Material, Col C: Name, Col D: Codex, Col F: URL
            TYPE_COL_IDX = 1 
            NAME_COL_IDX = 2
            CODEX_COL_IDX = 3
            URL_COL_IDX = 5

            # Verify required columns exist by checking indices against df.columns length
            max_required_idx = max(TYPE_COL_IDX, NAME_COL_IDX, CODEX_COL_IDX, URL_COL_IDX)
            if max_required_idx >= len(input_df.columns):
                st.error(f"Excel file is missing required columns. It has {len(input_df.columns)} columns, but needs at least {max_required_idx + 1} for all data (URL, Name, Type, Codex). Please check columns B, C, D, and F.")
                logging.error(f"Excel column check failed. Found columns: {input_df.columns.tolist()}. Required max index: {max_required_idx}")
                return

            input_df['PDF Final Name'] = ""
            input_df['Capture Status'] = "Pending"
            
            tasks_to_submit = []
            for index, row in input_df.iterrows():
                try:
                    # Safely access data using iloc for position-based access
                    type_of_material = str(row.iloc[TYPE_COL_IDX]) if pd.notna(row.iloc[TYPE_COL_IDX]) else ""
                    name = str(row.iloc[NAME_COL_IDX]) if pd.notna(row.iloc[NAME_COL_IDX]) else ""
                    codex = str(row.iloc[CODEX_COL_IDX]) if pd.notna(row.iloc[CODEX_COL_IDX]) else ""
                    url = str(row.iloc[URL_COL_IDX]) if pd.notna(row.iloc[URL_COL_IDX]) else ""

                    if not url.strip() or not url.lower().startswith(('http://', 'https://')):
                        logging.warning(f"Skipping row {index + 2} (Excel row number) due to invalid or empty URL: '{url}'")
                        input_df.loc[index, 'PDF Final Name'] = "SKIPPED_INVALID_URL"
                        input_df.loc[index, 'Capture Status'] = "Skipped (Invalid URL)"
                        continue
                    
                    pdf_final_name = generate_custom_filename(type_of_material, name, codex)
                    input_df.loc[index, 'PDF Final Name'] = pdf_final_name
                    tasks_to_submit.append({'url': url, 'filename': pdf_final_name, 'original_index': index})
                
                except IndexError:
                    err_msg = f"Error processing row {index + 2} in Excel. Not enough columns or incorrect structure."
                    st.warning(err_msg) # Use warning for individual row errors to not halt all
                    logging.error(f"{err_msg} Available columns in row: {len(row)} based on df structure: {len(input_df.columns)}.")
                    input_df.loc[index, 'PDF Final Name'] = "ERROR_PROCESSING_ROW_STRUCTURE"
                    input_df.loc[index, 'Capture Status'] = "Error"
                    continue
                except Exception as e:
                    err_msg = f"Unexpected error processing row {index + 2} for filename generation: {e}"
                    st.warning(err_msg)
                    logging.error(err_msg, exc_info=True)
                    input_df.loc[index, 'PDF Final Name'] = "ERROR_PROCESSING_ROW_UNKNOWN"
                    input_df.loc[index, 'Capture Status'] = "Error"
                    continue
            
            urls_to_process_count = len(tasks_to_submit)
            if urls_to_process_count == 0:
                st.info("No valid URLs found to process after initial parsing.")
                # Still provide the modified Excel for download if rows were skipped/errored
                if not input_df.empty:
                    output = io.BytesIO()
                    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
                        input_df.to_excel(writer, index=False, sheet_name='Report')
                        # Adjust column widths as needed, example:
                        worksheet = writer.sheets['Report']
                        for i, col_name in enumerate(input_df.columns):
                            max_data_len = input_df[col_name].astype(str).map(len).max() if not input_df[col_name].empty else 0
                            column_width = max(len(str(col_name)), max_data_len) + 2
                            worksheet.set_column(i, i, column_width)
                    excel_data = output.getvalue()
                    st.download_button(
                        label="Download Initial Report (No Captures Done)",
                        data=excel_data,
                        file_name="screenshot_report_initial.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                    )
                return

            total_urls = urls_to_process_count # This is the count of tasks actually submitted

            # Login credentials
            username = st.text_input("Username")
            password = st.text_input("Password", type="password")
            login_url = "https://profesionales.msd.es/login"
            
            # storage_state_path is initialized at the top of main(); the login step below assigns it.

            if st.button("Start Capture"):
                if not username or not password:
                    st.error("Please enter username and password")
                    return
                
                # Login (once) and get path to storage state file
                storage_state_path = login_to_portal(username, password, login_url)
                if storage_state_path is None:
                    st.error("Login failed. Cannot proceed.")
                    return # Exit if login failed
                
                # Create a unique directory for this batch of PDFs - MOVED HERE
                timestamp = time.strftime("%Y%m%d-%H%M%S")
                pdf_dir = os.path.join("pdf_outputs", timestamp)
                os.makedirs(pdf_dir, exist_ok=True)
                st.success(f"PDFs will be saved in: {os.path.abspath(pdf_dir)}")
                
                progress_bar = st.progress(0.0)
                status_text = st.empty()
                processed_count = 0
                successful_captures = 0

                max_processes = min(os.cpu_count() or 1, total_urls)
                logging.info(f"Using {max_processes} worker processes for {total_urls} tasks.")

                with ProcessPoolExecutor(max_workers=max_processes) as executor:
                    futures = {
                        executor.submit(process_worker_task, storage_state_path, task['url'], pdf_dir, task['filename']): task
                        for task in tasks_to_submit
                    }
                    
                    status_text.text(f"Submitted {total_urls} URLs to {max_processes} processes. Processing...")

                    for future in as_completed(futures):
                        task_info = futures[future]
                        original_index = task_info['original_index']
                        original_url = task_info['url']
                        submitted_filename = task_info['filename'] # This is the 'PDF Final Name'

                        try:
                            # Result from process_worker_task is (url, pdf_result_path, returned_filename)
                            res_url, pdf_result_path, returned_filename = future.result()
                            
                            if pdf_result_path and returned_filename == submitted_filename:
                                successful_captures += 1
                                input_df.loc[original_index, 'Capture Status'] = 'Success'
                                logging.info(f"Successfully processed URL: {res_url} -> PDF: {pdf_result_path} (Filename: {returned_filename})")
                            elif pdf_result_path and returned_filename != submitted_filename:
                                input_df.loc[original_index, 'Capture Status'] = 'Failed (Filename Mismatch)'
                                logging.warning(f"Worker returned a PDF but filename mismatched for URL: {original_url}. Expected: {submitted_filename}, Got: {returned_filename}, Path: {pdf_result_path}")
                            else: # pdf_result_path is None
                                input_df.loc[original_index, 'Capture Status'] = 'Failed (Worker Returned No PDF)'
                                logging.warning(f"Worker failed to return PDF for URL: {original_url} (Filename: {submitted_filename})")

                        except Exception as exc:
                            logging.error(f"URL {original_url} (Filename: {submitted_filename}) generated an exception in worker process: {exc}", exc_info=True)
                            input_df.loc[original_index, 'Capture Status'] = 'Error (Worker Exception)'
                        finally:
                            processed_count += 1
                            progress = processed_count / total_urls
                            progress_bar.progress(progress)
                            status_text.text(f"Processed {processed_count}/{total_urls} URLs... Success: {successful_captures}")
                
                # --- End of ProcessPoolExecutor block ---

                # Final status update
                status_text.text(f"Processing complete. Processed {processed_count}/{total_urls}.")
                st.success(f"Completed! Successfully captured {successful_captures} out of {total_urls} pages")
                
                # Prepare Excel report for download using the modified input_df
                if not input_df.empty:
                    output = io.BytesIO()
                    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
                        input_df.to_excel(writer, index=False, sheet_name='Report')
                        worksheet = writer.sheets['Report']
                        # Auto-adjust column widths for all columns in the report
                        for i, col_name in enumerate(input_df.columns):
                            # Calculate max length needed for the column
                            # Consider header length and max data length in the column
                            max_data_len = input_df[col_name].astype(str).map(len).max() if not input_df[col_name].empty else 0
                            column_width = max(len(str(col_name)), max_data_len) + 2 # Add a little padding
                            worksheet.set_column(i, i, column_width)

                    excel_data = output.getvalue()
                    st.download_button(
                        label="Download Report",
                        data=excel_data,
                        file_name="screenshot_report_final.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                    )
                else:
                    st.info("No data to generate a report.")

        except Exception as e:
            st.error(f"An critical error occurred in the main process: {e}")
            logging.error(f"Application error: {e}", exc_info=True)
        finally:
            # No global playwright/browser objects to clean up here anymore
            # Temporary storage state file cleanup is now handled by atexit
            logging.info("Main process try-finally block finished.")

if __name__ == "__main__":
    # The __main__ guard is required for ProcessPoolExecutor on platforms that spawn worker processes (e.g. Windows)
    main()
