import streamlit as st
import pandas as pd
from playwright.sync_api import sync_playwright, Page, Browser, Playwright
import time
import logging
import os
from PIL import Image
import io
import re
from concurrent.futures import ProcessPoolExecutor, as_completed
import json
import tempfile
from pypdf import PdfReader, PdfWriter
import copy
import math
import shutil
import atexit

# Setup logging
# Records at INFO level and above are appended (filemode 'a') to
# 'screenshot_capture.log', a path relative to the current working directory.
# Each record is formatted as "<timestamp> - <level name> - <message>".
logging.basicConfig(
    filename='screenshot_capture.log',
    filemode='a',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

# --- Global Constants ---
MAX_PDF_HEIGHT_PX = 14000  # Max height for a single PDF page segment in pixels
MAX_PDF_HEIGHT_PT = MAX_PDF_HEIGHT_PX * 72 / 96  # Same limit in points (72 pt per inch, assuming 96 px per inch) -> 10500.0
MAX_CAPTURE_HEIGHT_SINGLE_SEGMENT = 13900 # Pages taller than this (in px) trigger minimal cleanup; sits 100 px below MAX_PDF_HEIGHT_PX
DEFAULT_VIEWPORT_WIDTH = 1920 # Default viewport width in pixels for PDF capture
DEFAULT_FALLBACK_PAGE_HEIGHT_PX = 1080 # Fallback page height (in px) if everything else fails
# --- End Global Constants ---

def cleanup_temp_storage_file():
    """Delete the temporary storage-state file recorded in the Streamlit session.

    The path is read from ``st.session_state['temp_storage_file_to_delete']``,
    which ``login_to_portal`` sets after saving the browser storage state.
    Nothing is returned and nothing is raised for file-system errors; every
    outcome is written to the log instead:

    * file removed     -> INFO, and the session key is dropped so that later
      calls do not try again;
    * file not present -> INFO, the session key is left untouched;
    * any other OSError (permissions, path is a directory, ...) -> ERROR.

    NOTE(review): the "ATEXIT" log prefix suggests this is registered as an
    atexit hook elsewhere in the file - confirm against the caller.
    """
    session_key = 'temp_storage_file_to_delete'
    file_path = st.session_state.get(session_key)
    if not file_path:
        # No file was ever recorded (or it was already cleaned up).
        return

    # Attempt the removal directly instead of checking os.path.exists() first:
    # the file can disappear between the check and the remove call.
    try:
        os.remove(file_path)
    except FileNotFoundError:
        logging.info(f"ATEXIT: Temporary storage state file {file_path} not found, skipping removal.")
        return
    except OSError as e:
        logging.error(f"ATEXIT: Error removing temporary storage state file {file_path}: {e}")
        return

    logging.info(f"ATEXIT: Removed temporary storage state file: {file_path}")
    if session_key in st.session_state:
        del st.session_state[session_key]  # Prevent re-attempts

def initialize_playwright(headless: bool = False) -> "tuple[Playwright, Browser]":
    """Start the Playwright driver and launch a Chromium browser.

    Args:
        headless: Launch Chromium without a visible window when True. The
            default (False, i.e. headed) keeps the behaviour this function had
            when the value was hard-coded for debugging; pass True for
            production runs.

    Returns:
        A ``(playwright, browser)`` tuple. The caller owns both objects and is
        responsible for calling ``browser.close()`` and ``playwright.stop()``.

    Raises:
        Any exception raised while launching the browser is re-raised after the
        Playwright driver has been stopped.
    """
    playwright = sync_playwright().start()
    try:
        browser = playwright.chromium.launch(headless=headless)
    except Exception:
        # The driver is already running at this point; stop it so a failed
        # launch does not leave it behind with nobody holding a reference.
        playwright.stop()
        raise
    logging.info("Initialized Playwright and launched browser")
    return playwright, browser

def login_to_portal(username: str, password: str, login_url: str) -> str | None:
    """Log in to the portal with Playwright and save the session to a temp file.

    A headless Chromium instance walks through the two-step login form
    (username, then username + password), waits until the URL no longer
    contains "login", and dumps the browser context's storage state as JSON
    into a temporary ``.json`` file. Progress and errors are reported through
    Streamlit messages and the log.

    Args:
        username: Account name typed into the login form.
        password: Password typed into the login form.
        login_url: URL of the login page to open.

    Returns:
        Path of the temporary JSON file holding the storage state, or ``None``
        if any step of the login failed. The path is also stored in
        ``st.session_state.temp_storage_file_to_delete`` for later cleanup.
        All Playwright resources created here are closed before returning.
    """
    playwright = None
    browser = None
    context = None
    page = None
    try:
        st.info("Attempting login and saving session state...")
        playwright = sync_playwright().start()
        # Use headless=True for the initial login for efficiency, unless debugging login itself
        browser = playwright.chromium.launch(headless=True)
        context = browser.new_context(
            viewport={"width": 1920, "height": 1080},
            device_scale_factor=1,
            is_mobile=False,
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        page = context.new_page()

        st.info("Navigating to login page...")
        page.goto(login_url)
        logging.info("Navigated to login page")
        time.sleep(1)
        try:
            accept_button = page.locator("#onetrust-accept-btn-handler")
            # Locator.is_visible() returns immediately and ignores its timeout
            # argument, so a banner that renders slightly late would be missed.
            # wait_for() actually waits up to 3 s and raises if the banner
            # never shows, which lands in the "not found" branch below.
            accept_button.wait_for(state="visible", timeout=3000)
            accept_button.click()
            logging.info("Accepted cookies during login attempt")
            time.sleep(1)
        except Exception as e:
            # Best effort: a missing cookie dialog must not abort the login.
            logging.info(f"Cookie dialog not found or error during login: {e}")

        st.info("Entering username...")
        page.locator("#capture_signInFull_username").fill(username)
        logging.info("Entered username")

        st.info("Proceeding to password step...")
        page.locator("#buttonNext_signInFull").click()
        logging.info("Clicked 'Siguiente' button")

        st.info("Entering credentials...")
        # The second step shows its own username field; wait_for() raises (and
        # the login fails) if it does not appear within 10 s.
        username_field_retry = page.locator("#capture_signInFull_signInUsername")
        username_field_retry.wait_for(state='visible', timeout=10000)
        if username_field_retry.is_visible():
            username_field_retry.fill(username)
            logging.info("Re-entered username")

        page.locator("#capture_signInFull_currentPassword").fill(password)
        logging.info("Entered password")

        st.info("Submitting login...")
        page.get_by_role("button", name="Acceda").click()
        logging.info("Clicked 'Acceda' button")

        st.info("Waiting for login confirmation...")
        # Success is detected by being redirected away from any URL containing "login".
        page.wait_for_url(lambda url: "login" not in url, timeout=25000)
        logging.info(f"Login successful, redirected to: {page.url}")
        st.success("Login successful! Session state captured.")

        # Save storage state to a temporary file
        storage_state = context.storage_state()

        # delete=False: the file must outlive this function so it can be reused later.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
            storage_state_path = temp_file.name
            # Register the file for cleanup BEFORE writing to it, so that a
            # failed dump does not leave an untracked file behind.
            st.session_state.temp_storage_file_to_delete = storage_state_path
            json.dump(storage_state, temp_file)
            logging.info(f"Saved storage state to temporary file: {storage_state_path}")

        return storage_state_path # Return the path

    except Exception as e:
        logging.error(f"An error occurred during login: {e}", exc_info=True)
        st.error(f"Login failed: {e}")
        return None
    finally:
        # Cleanup resources used for login, in reverse order of creation.
        # Each step is guarded separately: one failing close must neither skip
        # the remaining ones nor replace this function's return value with an
        # exception raised from the cleanup itself.
        cleanup_steps = (
            ("page", page.close if page else None),
            ("context", context.close if context else None),
            ("browser", browser.close if browser else None),
            ("playwright", playwright.stop if playwright else None),
        )
        for label, release in cleanup_steps:
            if release is None:
                continue
            try:
                release()
            except Exception as cleanup_err:
                logging.warning(f"Error releasing {label} after login: {cleanup_err}")

def read_excel(file) -> pd.DataFrame:
    """Load an Excel file into a DataFrame, taking column names from its first row.

    On any failure the error is logged and shown in the Streamlit UI, and an
    empty DataFrame is returned so the caller can detect the problem by
    checking for emptiness.
    """
    try:
        # header=0 makes the first row the column header.
        frame = pd.read_excel(file, header=0)
        dimensions = frame.shape
        column_names = frame.columns.tolist()
        logging.info(f"Read Excel file. Shape: {dimensions}. Columns: {column_names}")
    except Exception as read_err:
        failure_text = f"Error reading Excel file: {read_err}"
        logging.error(failure_text)
        st.error(failure_text)
        # An empty frame is the failure signal for the caller.
        return pd.DataFrame()
    return frame

def generate_custom_filename(type_of_material: str, name: str, codex: str) -> str:
    """Generate a custom PDF filename from Excel data.

    The three parts are cleaned, empty parts are dropped, and the rest is
    joined with underscores and given a ``.pdf`` suffix.
    Example: infografia_Flow3Ciclo3Tomás_ES-DSM-00864.pdf

    Args:
        type_of_material: Material type; trailing underscores are removed.
        name: Material name; everything except letters and digits is removed.
            Accented and other non-ASCII letters are kept.
        codex: Code appended unchanged (converted to ``str``).

    Any argument that is ``None`` or a float NaN (an empty Excel cell) is
    treated as an empty part instead of becoming the text "None" / "nan".

    Returns:
        The filename. "default_filename.pdf" if every part is empty,
        "fallback_filename.pdf" if the joined name consists only of underscores.
    """
    def _as_text(value) -> str:
        # Empty spreadsheet cells arrive as None/NaN rather than as "".
        if value is None or (isinstance(value, float) and math.isnan(value)):
            return ""
        return str(value)

    # Clean type_of_material: remove trailing '_'
    cleaned_type_of_material = _as_text(type_of_material).rstrip('_')

    # Clean name: drop every character that is not a letter or digit.
    # [\W_] is Unicode-aware for str patterns, so "Tomás" keeps its "á";
    # the previous ASCII-only class [^a-zA-Z0-9] turned it into "Toms".
    cleaned_name = re.sub(r'[\W_]+', '', _as_text(name))

    # Codex is used as is, ensure it's a string
    cleaned_codex = _as_text(codex)

    # Keep only the non-empty parts so no leading/doubled underscores appear.
    filename_parts = [
        part
        for part in (cleaned_type_of_material, cleaned_name, cleaned_codex)
        if part
    ]

    if not filename_parts: # If all parts are empty
        base_name = "default_filename"
        logging.warning("All parts for filename were empty or cleaned to empty. Using 'default_filename.pdf'.")
    else:
        base_name = "_".join(filename_parts)

    # A part such as a codex of "_" survives the filter above but yields a
    # name made of underscores only; replace that with a usable name.
    if not base_name.strip('_'):
        base_name = "fallback_filename"
        logging.warning("Filename parts resulted in an empty or underscore-only name. Using 'fallback_filename.pdf'.")

    return f"{base_name}.pdf"

def remove_trailing_blank_page(pdf_path: str):
    """If the last page of the PDF has no text or XObjects, remove it.

    A page counts as blank when text extraction yields nothing but whitespace
    (or fails) AND its resources contain no "/XObject" entry. PDFs with a
    single page are never modified. When a page is dropped, the remaining
    pages are written to a sibling temporary file which then replaces
    ``pdf_path``, so a failed write cannot truncate the existing PDF.

    Args:
        pdf_path: Path of the PDF to inspect and, if needed, rewrite in place.

    Returns:
        None. This is best-effort post-processing: any error is logged as a
        warning and the original file is left as it was.
    """
    try:
        reader = PdfReader(pdf_path)
        total_pages = len(reader.pages)
        if total_pages <= 1:
            return  # nothing to trim
        last_page = reader.pages[-1]

        has_text = False
        try:
            text = last_page.extract_text()
            has_text = bool(text and text.strip())
        except Exception as extract_err:
            # Extraction failure is treated as "no text", as before, but is no
            # longer swallowed without a trace.
            logging.debug(f"Text extraction failed on last page of {pdf_path}: {extract_err}")

        # Index access resolves the entry to the actual resources dictionary.
        # NOTE(review): a plain .get() may hand back an unresolved indirect
        # reference depending on the pypdf version - confirm against the one installed.
        resources = last_page["/Resources"] if "/Resources" in last_page else {}
        has_xobjects = "/XObject" in resources

        if has_text or has_xobjects:
            return

        # Remove the last page: copy every page except the final one.
        writer = PdfWriter()
        for i in range(total_pages - 1):
            writer.add_page(reader.pages[i])

        # Write next to the target and swap it in with os.replace, so the
        # original stays intact if writing fails part-way through.
        tmp_path = f"{pdf_path}.{os.getpid()}.tmp"
        try:
            with open(tmp_path, "wb") as f_out:
                writer.write(f_out)
            os.replace(tmp_path, pdf_path)
        except Exception:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise
        logging.info(f"Removed trailing blank page from {pdf_path}")
    except Exception as e:
        logging.warning(f"Could not post-process PDF {pdf_path} to remove blank page: {e}")

def capture_and_save_pdf(page: Page, url: str, output_dir: str, filename: str):
    """Capture full page as PDF, segmented into max 14000px high sections using pypdf."""
    try:
        # --- Start: Capture Browser Console Logs ---
        def log_console_message(msg):
            # Log messages, filtering out any potential noise if necessary
            logging.info(f"[Browser Console] {msg.type.upper()}: {msg.text}")
        
        page.on("console", log_console_message)
        # --- End: Capture Browser Console Logs ---

        page.goto(url, wait_until='load', timeout=60000)
        logging.info(f"Navigated to URL: {url} [Process/Worker]")

        actual_page_height = None # Initialize to None
        # --- Start: Click Expandable Sections & Scroll ---
        try:
            expand_buttons = page.locator('text=/Leer más/i')
            count = expand_buttons.count()
            if count > 0:
                logging.info(f"Found {count} 'Leer más' elements for {url}. Clicking them. [Process/Worker]")
                for i in range(count):
                    try:
                        button = expand_buttons.nth(i)
                        if button.is_visible():
                            button.click(timeout=2000)
                            page.wait_for_timeout(500)
                    except Exception as click_err:
                        logging.warning(f"Could not click 'Leer más' element {i} on {url}: {click_err} [Process/Worker]")
            
            logging.info(f"Scrolling page to trigger lazy loading for {url}... [Process/Worker]")
            total_page_height_js = """
            () => {
                return document.documentElement.scrollHeight;
            }
            """
            actual_page_height = page.evaluate(total_page_height_js)
            viewport_height = page.viewport_size['height'] if page.viewport_size else 768
            current_scroll = 0
            scroll_increment = viewport_height * 0.8
            max_scroll_iterations = 50
            iterations = 0
            while current_scroll < actual_page_height and iterations < max_scroll_iterations:
                page.mouse.wheel(0, scroll_increment)
                page.wait_for_timeout(500)
                current_scroll += scroll_increment
                new_total_page_height = page.evaluate(total_page_height_js)
                if new_total_page_height > actual_page_height:
                    actual_page_height = new_total_page_height
                elif page.evaluate("window.scrollY + window.innerHeight >= document.body.scrollHeight - 10"):
                    logging.info(f"Attempting to generate PDF with effective height: {actual_page_height}px for {url} [Process/Worker]")
                    break
                iterations += 1
            if iterations >= max_scroll_iterations:
                logging.warning(f"Max scroll iterations reached for {url}. Final height: {actual_page_height} [Process/Worker]")
            logging.info(f"Finished scrolling for {url}. Final evaluated height: {actual_page_height}px [Process/Worker]")

        except Exception as scroll_err:
            logging.error(f"Error during clicking/scrolling for {url}: {scroll_err} [Process/Worker]")
        # --- End: Click Expandable Sections & Scroll ---

        if actual_page_height is None:
            logging.error(f"Could not determine page height for {url}. Aborting PDF capture.")
            return None

        pre_cleanup_actual_page_height = actual_page_height # Store height before cleanup
        logging.info(f"Page height for {url} BEFORE cleanup: {pre_cleanup_actual_page_height}px [Process/Worker]")

        # --- Start: JavaScript and CSS for page cleanup ---
        hide_elements_js_script = """
        (isMinimalMode) => {
            const alwaysHideSelectors = [
                // OneTrust & Cookie Banners
                '#onetrust-consent-sdk', '#ot-sdk-btn-floating', '.ot-sdk-show-settings',
                'div[data-testid=\"ot-sdk-show-settings\"]', '#onetrust-pc-btn-handler',
                '#onetrust-banner-sdk', '.onetrust-pc-dark-filter',
                // Chat Widgets
                '[id*=\"drift-widget\"]', '[class*=\"drift-widget\"]',
                '[id*=\"hubspot-messages-iframe\"]', '[class*=\"hubspot-messages-iframe\"]',
                'iframe[title*=\"chat\" i][style*=\"position: fixed\"]', 'iframe[name*=\"chat\" i][style*=\"position: fixed\"]'
            ];

            const hideElementsFunc = (selectors) => {
                selectors.forEach(selector => {
                    try {
                        document.querySelectorAll(selector).forEach(el => {
                            el.style.setProperty('display', 'none', 'important');
                            el.style.setProperty('visibility', 'hidden', 'important');
                        });
                    } catch (e) { console.warn('Cascade AI: Error hiding selector:', selector, e); }
                });
            };

            hideElementsFunc(alwaysHideSelectors); // These are always hidden

            // Always manage scrollbars for PDF generation
            document.documentElement.style.overflow = 'hidden';
            document.body.style.overflow = 'hidden';
            document.documentElement.style.scrollbarWidth = 'none'; // For Firefox
            document.body.style.scrollbarWidth = 'none'; // For Firefox

            // --- START: Glide.js Slider PDF Fix (Slider Fix Strategy, Runs Always) ---
            console.log('Cascade AI: Attempting to fix Glide.js sliders for PDF (Slider Fix Strategy, Runs Always)...');
            const sliders = document.querySelectorAll('.mhh-mcn-slider'); // Main slider class

            sliders.forEach(sliderElement => {
                try {
                    // Attempt to find and destroy the Glide instance first
                    if (sliderElement.__glide__ && typeof sliderElement.__glide__.destroy === 'function') {
                        console.log('Cascade AI: Destroying Glide instance for slider:', sliderElement);
                        sliderElement.__glide__.destroy();
                    } else {
                        console.log('Cascade AI: Glide instance or destroy() method not directly found for slider:', sliderElement, '. Proceeding with manual DOM manipulation (Slider Fix Strategy).');
                    }

                    const slidesWrapper = sliderElement.querySelector('.mhh-mcn-slider-slides');
                    // Convert NodeList to Array to safely iterate while removing elements
                    const slides = Array.from(sliderElement.querySelectorAll('.mhh-mcn-slider-slide')); 

                    if (slidesWrapper) {
                        slidesWrapper.style.setProperty('transform', 'none', 'important');
                        slidesWrapper.style.setProperty('width', '100%', 'important');
                        slidesWrapper.style.setProperty('height', 'auto', 'important');
                        slidesWrapper.style.setProperty('display', 'block', 'important');
                        slidesWrapper.style.setProperty('position', 'static', 'important');
                        slidesWrapper.style.setProperty('overflow', 'hidden', 'important');
                        slidesWrapper.style.setProperty('min-height', '0px', 'important');
                        slidesWrapper.style.setProperty('max-height', 'none', 'important');
                    }

                    if (slides && slides.length > 0) {
                        slides.forEach((slide, index) => {
                            // Clear any active class first
                            slide.classList.remove('mhh-mcn-slider-slide--active');
                            slide.style.cssText = ''; // Clear ALL inline styles

                            if (index === 0) { // First slide
                                slide.style.setProperty('display', 'block', 'important');
                                slide.style.setProperty('visibility', 'visible', 'important');
                                slide.style.setProperty('opacity', '1', 'important');
                                slide.style.setProperty('position', 'static', 'important');
                                slide.style.setProperty('transform', 'none', 'important');
                                slide.style.setProperty('width', '100%', 'important');
                                slide.style.setProperty('height', 'auto', 'important');
                                slide.style.setProperty('float', 'none', 'important');
                                slide.style.setProperty('margin', '0', 'important');
                                slide.style.setProperty('box-sizing', 'border-box', 'important');
                                slide.style.setProperty('overflow', 'hidden', 'important');
                                slide.classList.add('mhh-mcn-slider-slide--active'); 
                                console.log('Cascade AI: Styled first slide for PDF (Slider Fix Strategy):', slide);
                            } else { // Other slides - HIDE THEM AGGRESSIVELY
                                slide.style.setProperty('display', 'none', 'important');
                                slide.style.setProperty('visibility', 'hidden', 'important');
                                slide.style.setProperty('opacity', '0', 'important');
                                slide.style.setProperty('position', 'absolute', 'important'); 
                                slide.style.setProperty('width', '0px', 'important');
                                slide.style.setProperty('height', '0px', 'important');
                                slide.style.setProperty('overflow', 'hidden', 'important');
                                slide.style.setProperty('margin', '0', 'important');
                                slide.style.setProperty('padding', '0', 'important');
                                console.log('Cascade AI: Aggressively hid other slide using CSS (Slider Fix Strategy):', slide);
                            }
                        });
                        
                        const firstSlideElement = sliderElement.querySelector('.mhh-mcn-slider-slide:first-child');
                        if(firstSlideElement && !firstSlideElement.classList.contains('mhh-mcn-slider-slide--active')) {
                            firstSlideElement.classList.add('mhh-mcn-slider-slide--active');
                            console.log('Cascade AI: Ensured first slide element has --active class post-manipulation (Slider Fix Strategy).');
                        }
                    }
                } catch (e) {
                    console.warn('Cascade AI: Error processing slider for PDF (Slider Fix Strategy):', sliderElement, e);
                }
            });
            console.log('Cascade AI: Finished Glide.js slider PDF fix attempt (Slider Fix Strategy, Runs Always).');
            // --- END: Glide.js Slider PDF Fix ---

            // If minimal cleanup is requested (for long pages), only the 'alwaysHideSelectors' and scrollbar adjustments are done.
            if (isMinimalMode) { // isMinimalMode is apply_minimal_cleanup_for_long_page passed from Python
                console.log('Cascade AI: Applying MINIMAL cleanup (alwaysHideSelectors + scrollbars only).');
                return;
            }

            // Full cleanup: Hide additional general popups/overlays that are not critical cookies/chats
            console.log('Cascade AI: Applying FULL cleanup (all designated selectors + scrollbars).');

            const conditionalSelectorsToHide = [
                '[class*=\"overlay\"], [class*=\"spinner\"], [class*=\"loading\"], [id*=\"spinner\"]'
                // Add other non-critical popups here if needed
            ];
            hideElementsFunc(conditionalSelectorsToHide);
        }
        """

        simplified_css_to_inject = """
        /* CSS for very long pages, ensuring sidebar stability and minimal interference */
        body, html {
            overflow: hidden !important; /* Critical for Playwright PDF stability */
            height: auto !important; /* Ensure full content height consideration */
            margin: 0 !important;
            padding: 0 !important;
        }

        /* Flexbox layout for sidebar and main content */
        section#mhh_mcn_content.mhh-mcn-content { /* Parent container */
            display: flex !important;
            flex-direction: row !important;
            align-items: flex-start !important;
            position: static !important;
            width: 100% !important; 
            height: auto !important;
            overflow: visible !important; 
        }
        aside.mhh-mcn-sidebar.mhh-mcn-sidebar--is-sticky { /* Sidebar */
            position: static !important; 
            display: block !important; 
            flex: 0 0 260px !important;    
            margin-right: 20px !important; 
            height: auto !important;
            max-height: none !important;
            overflow-y: visible !important; 
            page-break-inside: avoid !important;
        }
        main#mhh_mcn_main.mhh-mcn-content-main { /* Main content */
            position: static !important;
            display: block !important; 
            flex: 1 1 auto !important;     
            min-width: 0; 
            height: auto !important;
            overflow-y: visible !important; 
        }

        /* Ensure critical page structure elements are static */
        header.mhh-mcn-header, footer.mhh-mcn-footer, div.mhh-mcn-header-sticky-wrapper {
            position: static !important;
            transform: none !important;
            top: auto !important; left: auto !important; right: auto !important; bottom: auto !important;
        }
        
        /* Page break and color adjust rules from full CSS that are safe */
        table, figure, img, svg { page-break-before: avoid !important; page-break-inside: avoid !important; }
        tr, td, th { page-break-inside: avoid !important; }
        h1, h2, h3, h4, h5, h6 { page-break-after: avoid !important; page-break-inside: avoid !important; }
        p, ul, ol, li, blockquote { page-break-inside: avoid !important; }
        * { -webkit-print-color-adjust: exact !important; print-color-adjust: exact !important; }
        """

        css_to_inject = """
/* Rules for html/body to prevent excess whitespace and ensure full content height */
        html, body {
            height: auto !important;
            min-height: auto !important; /* Override vh units or other min-heights */
            max-height: none !important;
            overflow: hidden !important; /* Changed from visible, to match simplified_css_to_inject */
            margin: 0 !important;
            padding: 0 !important;
        }

        /* General rule for fixed/sticky elements to make them part of the flow for print - COMMENTED OUT DUE TO BREAKING CAROUSELS/COMPONENTS
        *[style*="position: fixed" i],
        *[style*="position: sticky" i],
        *:-webkit-sticky,
        *:sticky {
            position: static !important;
            top: auto !important;
            left: auto !important;
            right: auto !important;
            bottom: auto !important;
            transform: none !important; /* Also reset transforms */
        }
        */
        /* Specifically reset position for cookie banner for PDF */
        #onetrust-banner-sdk { /* JS hides this, static position is a fallback */
            position: static !important;
            top: auto !important;
            left: auto !important;
            right: auto !important;
            bottom: auto !important;
            transform: none !important; /* Safe for a hidden element */
        }

        /* Specifically reset position for site header for PDF */
        header.site-header { /* This is a general selector, ensure it's the correct one for MSD pages or refine */
            position: static !important;
            top: auto !important;
            left: auto !important;
            right: auto !important;
            bottom: auto !important;
            transform: none !important; 
        }

        /* Specific rules for MSD layout components to ensure correct PDF rendering */
        section#mhh_mcn_content.mhh-mcn-content { /* Parent container for sidebar and main content */
            display: flex !important;
            flex-direction: row !important;
            align-items: flex-start !important;
            position: static !important;
            width: 100% !important; /* Ensure it spans full width to arrange children */
            height: auto !important;
            overflow: visible !important; /* Let children control their overflow */
        }

        /* Sidebar as a flex item */
        aside.mhh-mcn-sidebar.mhh-mcn-sidebar--is-sticky {
            position: static !important; 
            display: block !important; /* Flex items are block-like by default, but explicit is fine */
            flex: 0 0 260px !important;    /* Do not grow, do not shrink, basis 260px */
            margin-right: 20px !important; 
            height: auto !important;
            max-height: none !important;
            overflow-y: visible !important; /* Sidebar content should be visible */
            page-break-inside: avoid !important;
        }

        /* Inner navigation list within the sidebar - should fill the aside */
        nav.js-intra-page-nav__list {
            position: static !important; 
            display: block !important;
            width: 100% !important;        /* Fill the width of the parent aside */
            height: auto !important;
            max-height: none !important;
            overflow-y: visible !important;
            page-break-inside: avoid !important; 
        }
        
        /* Also ensure the sticky area div within aside behaves */
        div.mhh-mcn-sidebar-sticky-area {
            position: static !important;
            width: 100% !important;
            height: auto !important;
            max-height: none !important;
            overflow-y: visible !important;
        }

        /* Main content as a flex item */
        main#mhh_mcn_main.mhh-mcn-content-main { 
            position: static !important;
            display: block !important; /* Flex items are block-like */
            flex: 1 1 auto !important;     /* Grow and shrink to take available space */
            min-width: 0; /* Important for flex items that might contain long unbroken strings or wide fixed-width elements */
            height: auto !important;
            overflow-y: visible !important; /* Or 'auto' if main content itself needs to scroll */
        }

        /* Ensure the overall page container doesn't hide content */
        .product-template-default {
            overflow: visible !important;
        }

        /* 
        COMMENTING OUT CAROUSEL CSS TO TEST SIDEBAR STABILITY 
        -----------------------------------------------------
        /* Carousel fixes for PDF: Force items to be visible and stack */
        /*
        .wp-block-mconnect-theme-carousel .tns-carousel,
        .wp-block-mconnect-theme-carousel .tns-outer,
        .wp-block-mconnect-theme-carousel .tns-viewport,
        .wp-block-mconnect-theme-carousel .tns-inner {
            position: static !important;
            width: auto !important;
            height: auto !important;
            overflow: visible !important;
            transform: none !important;
        }

        /* Hide all carousel items by default for PDF */
        /*
        .wp-block-mconnect-theme-carousel .tns-item {
            display: none !important;
        }

        /* Display only the first carousel item for PDF */
        /*
        .wp-block-mconnect-theme-carousel .tns-item:first-child {
            position: static !important;
            display: block !important; /* Show only the first item */
        /*    float: none !important;
            width: 100% !important;
            height: auto !important;
            margin: 0 !important; /* No margin needed if only one is shown */
        /*    opacity: 1 !important;
            visibility: visible !important;
            transform: none !important;
        }

        /* Ensure the inner columns container in the first slide uses flex */
        /*
        .wp-block-mconnect-theme-carousel .tns-item:first-child .mhh-mcn-columns-inner {
            display: flex !important;
            flex-direction: row !important;
            width: 100% !important; /* Take full width of the slide item */
        /*    align-items: flex-start !important; /* Align items to the top */
        /*}

        /* Style the columns within the first slide */
        /*
        .wp-block-mconnect-theme-carousel .tns-item:first-child .mhh-mcn-v1-column {
            flex: 0 0 50% !important;          /* Do not grow, do not shrink, basis 50% */
        /*    width: 50% !important; /* ADDED: Be extra explicit about width */
        /*    max-width: 50% !important;         /* Enforce max width */
        /*    padding: 0 10px !important;        /* Add some spacing between columns */
        /*    box-sizing: border-box !important;
            overflow: hidden !important;       /* Prevent content from spilling and breaking layout */
        /*    vertical-align: top !important;    /* Align content to the top, just in case */
        /*}
        /*
        .wp-block-mconnect-theme-carousel .tns-item:first-child .mhh-mcn-v1-column:first-child img {
             width: 100% !important;
             height: auto !important;
             object-fit: contain !important; /* Or 'cover' if preferred, but 'contain' ensures full visibility */
        /*}

        /* Hide carousel controls as they are non-functional in PDF */
        /*
        .wp-block-mconnect-theme-carousel .carousel-controls-container,
        .wp-block-mconnect-theme-carousel [class*='tns-controls'], /* Catches tns-controls and similar */
        /*
        .wp-block-mconnect-theme-carousel .tns-nav {
            display: none !important;
        }
        -----------------------------------------------------
        END OF COMMENTED OUT CAROUSEL CSS
        */

        /* General fix for mhh-mcn-columns layout system (Patologías, Destacados, Productos, etc.) */
        .mhh-mcn-columns { /* The flex container for columns */
            display: flex !important;
            flex-wrap: nowrap !important; /* Assumes content per row is defined by --count-N */
            justify-content: space-between !important; /* Distributes items with space */
            width: 100% !important;
            visibility: visible !important;
            opacity: 1 !important;
            padding: 0 !important; /* Reset padding */
            margin: 0 auto !important; /* Center if it's a block, reset vertical margin */
            list-style: none !important; /* Reset list styles if applicable */
            box-sizing: border-box !important;
        }

        .mhh-mcn-columns .mhh-mcn-column { /* Individual column items */
            opacity: 1 !important;
            visibility: visible !important;
            page-break-inside: avoid !important;
            box-sizing: border-box !important;
            margin: 0 !important; /* Parent handles spacing */
            padding: 8px !important; /* Inner padding for content within column, adjust as needed */
            border: none !important; /* Reset borders */
        }

        /* Specific column counts using flex-basis */
        .mhh-mcn-columns.mhh-mcn-columns--count-1 .mhh-mcn-column {
            flex: 0 0 100% !important;
            max-width: 100% !important;
        }
        .mhh-mcn-columns.mhh-mcn-columns--count-2 .mhh-mcn-column {
            flex: 0 0 48.5% !important; /* Approx for 2 columns with space-between */
            max-width: 48.5% !important;
        }
        .mhh-mcn-columns.mhh-mcn-columns--count-3 .mhh-mcn-column {
            flex: 0 0 32% !important;   /* Approx for 3 columns with space-between */
            max-width: 32% !important;
        }
        .mhh-mcn-columns.mhh-mcn-columns--count-4 .mhh-mcn-column {
            flex: 0 0 23.5% !important; /* Approx for 4 columns with space-between */
            max-width: 23.5% !important;
        }
        /* Add more .mhh-mcn-columns--count-N rules if other counts exist (e.g., 5 or 6) */

        /* Images within these columns */
        .mhh-mcn-columns .mhh-mcn-column img {
            max-width: 100% !important;
            height: auto !important;
            object-fit: contain !important;
            display: block !important;
            margin-bottom: 8px !important; /* Space below image if text follows */
        }

        /* --- START: Commented out image styling for simplification ---
        img {
            max-width: 100% !important;
            height: auto !important;
        }

        .wp-block-mconnect-theme-carousel img,
        .tns-carousel img {
            height: unset !important;
            object-fit: unset !important;
        }

        .wp-block-mconnect-theme-curated-content img {
            height: unset !important;
            object-fit: unset !important;
        }
        --- END: Commented out image styling for simplification --- */

        /* --- START: Glide.js Slider - First Slide Two-Column Layout for PDF --- */
        /* Ensure the main slider container and track are static and allow overflow for content */
        .mhh-mcn-slider {
            position: static !important;
            overflow: visible !important;
            height: auto !important;
            width: 100% !important;
        }
        .mhh-mcn-slider .mhh-mcn-slider-track {
            position: static !important;
            overflow: visible !important;
            height: auto !important;
            width: 100% !important;
        }

        /* Reset transformations and styles on the slides wrapper for PDF */
        .mhh-mcn-slider .mhh-mcn-slider-slides {
            transform: none !important;
            width: 100% !important; /* Ensure it takes full width of its static parent */
            height: auto !important;
            display: block !important; /* Display as a block, not flex or other JS-driven layout */
            position: static !important;
            opacity: 1 !important;
            visibility: visible !important;
        }

        /* --- Slide Visibility Control: Show only the FIRST slide --- */
        /* 1. Hide all slides by default */
        .mhh-mcn-slider .mhh-mcn-slider-slide {
            display: none !important;
            visibility: hidden !important;
            opacity: 0 !important;
            width: 100% !important; 
            position: static !important; 
            margin: 0 !important; 
            float: none !important; 
        }
        /* 2. Explicitly show ONLY the .mhh-mcn-slider-slide:first-child */
        .mhh-mcn-slider .mhh-mcn-slider-slide:first-child {
            display: block !important;
            visibility: visible !important;
            opacity: 1 !important;
            height: auto !important;
            width: 100% !important; 
            position: static !important;
        }
        /* 3. Ensure any other slide that might be --active (but is not the first-child) remains hidden. */
        /*    This is a safeguard. The primary rule is to show only :first-child. */
        .mhh-mcn-slider .mhh-mcn-slider-slide--active:not(:first-child) {
             display: none !important; 
             visibility: hidden !important;
             opacity: 0 !important;
        }
        /* --- End Slide Visibility Control --- */


        /* Apply flex to the inner columns container of the active/first slide */
        .mhh-mcn-slider .mhh-mcn-slider-slide:first-child .mhh-mcn-columns-inner,
        .mhh-mcn-slider .mhh-mcn-slider-slide--active .mhh-mcn-columns-inner {
            display: flex !important;
            flex-direction: row !important;
            flex-wrap: nowrap !important;
            width: 100% !important; 
            align-items: flex-start !important;
            visibility: visible !important;
            opacity: 1 !important;
            position: static !important;
            background-color: transparent !important; /* Ensure no weird background colors */
        }

        /* Style the two columns within the active/first slide's columns-inner */
        .mhh-mcn-slider .mhh-mcn-slider-slide:first-child .mhh-mcn-v1-column,
        .mhh-mcn-slider .mhh-mcn-slider-slide--active .mhh-mcn-v1-column {
            flex: 0 0 50% !important;         
            max-width: 50% !important;        
            padding: 0 10px !important;       
            box-sizing: border-box !important;
            display: block !important;
            visibility: visible !important;
            opacity: 1 !important;
            position: static !important;
            margin: 0 !important; /* Reset margins */
            float: none !important; /* Clear floats */
        }
        
        /* Ensure images within these columns behave */
        .mhh-mcn-slider .mhh-mcn-slider-slide:first-child .mhh-mcn-v1-column img,
        .mhh-mcn-slider .mhh-mcn-slider-slide--active .mhh-mcn-v1-column img {
             width: 100% !important;
             height: auto !important;
             object-fit: contain !important;
             display: block !important;
        }

        /* Hide slider controls and navigation elements */
        .mhh-mcn-slider .mhh-mcn-slider-control-panel,
        .mhh-mcn-slider .mhh-mcn-slider-navigation,
        .mhh-mcn-slider .mhh-mcn-slider-controls,
        .mhh-mcn-slider [data-glide-el*=\"arrow\"],
        .mhh-mcn-slider [data-glide-el=\"controls\"],
        .mhh-mcn-slider [data-glide-el=\"bullets\"] {
            display: none !important;
            visibility: hidden !important;
            opacity: 0 !important;
        }
        /* --- END: Glide.js Slider - First Slide Two-Column Layout for PDF --- */

        table, figure, img, svg { page-break-before: avoid !important; page-break-inside: avoid !important; }
        tr, td, th { page-break-inside: avoid !important; }
        h1, h2, h3, h4, h5, h6 { page-break-after: avoid !important; page-break-inside: avoid !important; }
        p, ul, ol, li, blockquote { page-break-inside: avoid !important; }
        * { -webkit-print-color-adjust: exact !important; print-color-adjust: exact !important; }
        """

        apply_minimal_cleanup_for_long_page = pre_cleanup_actual_page_height > MAX_CAPTURE_HEIGHT_SINGLE_SEGMENT

        # Emulate print media BEFORE applying styles and measuring final height
        try:
            page.emulate_media(media="print")
            logging.info(f"Emulated print media for {url}. [Process/Worker]")
        except Exception as e_emulate:
            logging.warning(f"Could not emulate print media for {url}: {e_emulate} [Process/Worker]")

        try:
            logging.info(f"Applying JS cleanup for {url} (minimal mode: {apply_minimal_cleanup_for_long_page}) [Process/Worker]")
            page.evaluate(hide_elements_js_script, apply_minimal_cleanup_for_long_page) # Pass the boolean flag
            
            if apply_minimal_cleanup_for_long_page:
                logging.info(f"Applying SIMPLIFIED CSS for long page: {url} [Process/Worker]")
                page.add_style_tag(content=simplified_css_to_inject)
            else:
                logging.info(f"Applying FULL CSS for standard page: {url} [Process/Worker]")
                page.add_style_tag(content=css_to_inject) # Ensure full CSS is used here
            logging.info("Waiting for styles to apply...")
            page.wait_for_timeout(1000) # Wait 1 second
            logging.info(f"JS and CSS for page cleanup applied for {url}. [Process/Worker]")
        except Exception as e_js_css:
            logging.error(f"Error applying JS/CSS cleanup for {url}: {e_js_css} [Process/Worker]")
        # --- End: JavaScript and CSS for page cleanup ---

        # Re-evaluate page height AFTER cleanup, as hiding elements can change it.
        final_actual_page_height_for_pdf = page.evaluate(total_page_height_js)
        logging.info(f"Page height for {url} AFTER cleanup: {final_actual_page_height_for_pdf}px [Process/Worker]")

        # Check if page height changed significantly, indicating potential layout issues from hiding elements.
        if abs(final_actual_page_height_for_pdf - pre_cleanup_actual_page_height) > 1000 and not apply_minimal_cleanup_for_long_page:
            logging.warning(f"Significant page height change for {url} after full cleanup: {pre_cleanup_actual_page_height}px to {final_actual_page_height_for_pdf}px. This might indicate issues. [Process/Worker]")
        elif final_actual_page_height_for_pdf <= 100: # Arbitrary small value, page is likely blank or error
            logging.warning(f"Page height for {url} is suspiciously small ({final_actual_page_height_for_pdf}px) after cleanup. Check for errors. [Process/Worker]")

        # Ensure there's a valid height to use for PDF generation
        if final_actual_page_height_for_pdf <= 0:
            logging.error(f"Page height for {url} is zero or negative after cleanup. Using pre-cleanup height: {pre_cleanup_actual_page_height}px. [Process/Worker]")
            final_actual_page_height_for_pdf = pre_cleanup_actual_page_height
            if final_actual_page_height_for_pdf <= 0: # Still zero, fallback to a default
                final_actual_page_height_for_pdf = DEFAULT_FALLBACK_PAGE_HEIGHT_PX
                logging.error(f"Pre-cleanup height also invalid. Using default fallback height: {DEFAULT_FALLBACK_PAGE_HEIGHT_PX}px for {url}. [Process/Worker]")

        # --- Start: PDF Generation ---
        output_pdf_writer = PdfWriter()
        temp_segment_paths = [] # To store paths of temporary segment PDFs for cleanup

        # Define PDF dimensions and filename
        pdf_width_px_str = f"{DEFAULT_VIEWPORT_WIDTH}px"
        final_pdf_path = os.path.join(output_dir, filename)
        logging.info(f"Attempting to generate PDF with effective height: {final_actual_page_height_for_pdf}px for {url} [Process/Worker]")

        try:
            # Diagnostic screenshot (do this once before single/multi decision)
            # if apply_minimal_cleanup_for_long_page:
            #     diagnostic_png_filename = f"{os.path.splitext(filename)[0]}_diagnostic_before_pdf.png"
            #     diagnostic_png_path = os.path.join(output_dir, diagnostic_png_filename)
            #     try:
            #         page.screenshot(path=diagnostic_png_path, full_page=True, timeout=30000)
            #         logging.info(f"Saved diagnostic screenshot to {diagnostic_png_path} for {url} [Process/Worker]")
            #     except Exception as e_diag_png:
            #         logging.error(f"Failed to save diagnostic screenshot for {url}: {e_diag_png} [Process/Worker]")

            if final_actual_page_height_for_pdf <= MAX_PDF_HEIGHT_PX:
                # Page is short enough for a single PDF segment, capture directly
                logging.info(f"Page height {final_actual_page_height_for_pdf}px is within single segment limit {MAX_PDF_HEIGHT_PX}px. Capturing directly for {url}. [Process/Worker]")
                with tempfile.NamedTemporaryFile(delete=False, suffix="_single.pdf") as tmp_single_pdf:
                    single_pdf_path = tmp_single_pdf.name
                temp_segment_paths.append(single_pdf_path) # Add for cleanup

                page.pdf(
                    path=single_pdf_path,
                    print_background=True,
                    width=pdf_width_px_str,
                    height=f"{final_actual_page_height_for_pdf}px",
                    margin={"top": "0px", "bottom": "0px", "left": "0px", "right": "0px"}
                )
                output_pdf_writer.append(single_pdf_path)
                logging.info(f"Single segment PDF saved to {single_pdf_path} for {url} [Process/Worker]")
            else:
                # Page is too tall. Generate one tall PDF and then use pypdf to crop it into segments.
                logging.info(f"Page height {final_actual_page_height_for_pdf}px exceeds single segment limit {MAX_PDF_HEIGHT_PX}px. Generating full tall PDF for segmentation for {url}. [Process/Worker]")
                
                with tempfile.NamedTemporaryFile(delete=False, suffix="_full_tall.pdf") as tmp_tall_pdf:
                    temp_tall_pdf_path = tmp_tall_pdf.name
                temp_segment_paths.append(temp_tall_pdf_path) # Add for cleanup

                page.pdf(
                    path=temp_tall_pdf_path,
                    print_background=True,
                    width=pdf_width_px_str,
                    height=f"{final_actual_page_height_for_pdf}px", # Full height
                    margin={"top": "0px", "bottom": "0px", "left": "0px", "right": "0px"}
                )
                logging.info(f"Full tall temporary PDF saved to {temp_tall_pdf_path} for {url} [Process/Worker]")

                # # Save a debug copy of the full tall PDF before cropping (Now commented out as per user request)
                # debug_tall_pdf_filename = f"DEBUG_TALL_{filename}"
                # debug_tall_pdf_path = os.path.join(output_dir, debug_tall_pdf_filename)
                # try:
                #     shutil.copy(temp_tall_pdf_path, debug_tall_pdf_path)
                #     logging.info(f"Saved debug copy of tall PDF to: {debug_tall_pdf_path} [Process/Worker]")
                # except Exception as e_copy_debug:
                #     logging.warning(f"Could not save debug copy of tall PDF {temp_tall_pdf_path} to {debug_tall_pdf_path}: {e_copy_debug} [Process/Worker]")

                # Add the full tall PDF directly to the output writer
                output_pdf_writer.append(temp_tall_pdf_path)
                logging.info(f"Added full tall PDF {temp_tall_pdf_path} directly to the final output for {url}. [Process/Worker]")

                # --- Start of Commented Out Cropping Logic ---
                # reader = PdfReader(temp_tall_pdf_path)
                # original_tall_page = reader.pages[0]

                # # Convert dimensions to points for pypdf operations
                # # Playwright's page.pdf width/height are in pixels if units aren't specified, 
                # # but pypdf boxes are in points (1/72 inch).
                # # Assuming PDF is rendered by Playwright as if on a 96 DPI screen for pixel calculations.
                # points_per_pixel = 72 / 96.0

                # # pdf_width_px_str is available in this scope (e.g., "1280px")
                # # Parse the integer value from it.
                # parsed_pdf_width_px = int(pdf_width_px_str.replace('px', ''))
                # page_width_pt = parsed_pdf_width_px * points_per_pixel
                # original_page_height_pt = final_actual_page_height_for_pdf * points_per_pixel
                # # MAX_PDF_HEIGHT_PT is already defined globally and correctly in points.

                # if MAX_PDF_HEIGHT_PT <= 0:
                #     logging.error(f"MAX_PDF_HEIGHT_PT is zero or negative ({MAX_PDF_HEIGHT_PT}), cannot segment. Skipping segmentation for {url}.")
                #     # Fallback: add the uncropped tall PDF (or handle error appropriately)
                #     # output_pdf_writer.append(temp_tall_pdf_path) # Appending the uncropped tall one as a fallback
                # else:
                #     num_segments = math.ceil(original_page_height_pt / MAX_PDF_HEIGHT_PT)
                #     logging.info(f"Calculated {num_segments} segments for tall PDF {url}. Original height: {original_page_height_pt:.2f}pt, Max segment: {MAX_PDF_HEIGHT_PT:.2f}pt. [Process/Worker]")

                #     for i in range(int(num_segments)):
                #         y_offset_from_top_pt = i * MAX_PDF_HEIGHT_PT
                #         current_segment_height_pt = min(MAX_PDF_HEIGHT_PT, original_page_height_pt - y_offset_from_top_pt)

                #         if current_segment_height_pt <= 0.01: # Check against a small epsilon for float comparisons
                #             logging.warning(f"Skipping segment {i+1} due to very small or negative height ({current_segment_height_pt:.2f}pt) for {url}.")
                #             continue

                #         # Cropbox coordinates (points) from the original tall page's coordinate system
                #         # (0,0) for cropbox is bottom-left of the original_tall_page
                #         cb_llx_pt = 0
                #         # Cropbox LowerY: from bottom of original page, it's (total_height - offset_from_top - segment_height)
                #         cb_lly_pt = original_page_height_pt - y_offset_from_top_pt - current_segment_height_pt
                #         cb_urx_pt = page_width_pt
                #         # Cropbox UpperY: from bottom of original page, it's (total_height - offset_from_top)
                #         cb_ury_pt = original_page_height_pt - y_offset_from_top_pt
                        
                #         # Create a temporary file for this specific cropped segment
                #         with tempfile.NamedTemporaryFile(delete=False, suffix=f"_segment_{i+1}.pdf") as tmp_cropped_segment_file:
                #             segment_pdf_path = tmp_cropped_segment_file.name
                #         temp_segment_paths.append(segment_pdf_path) # Add for cleanup

                #         # Create a new PdfWriter for this single segment
                #         single_segment_writer = PdfWriter()
                #         # Add a *copy* of the original tall page's content to this new writer
                #         # Making a copy is important if modifying boxes on the same page object multiple times
                #         page_copy_for_segment = copy.copy(original_tall_page) 
                #         single_segment_writer.add_page(page_copy_for_segment)
                        
                #         # Access the page within this new writer to modify its boxes
                #         page_to_crop_in_segment_writer = single_segment_writer.pages[0]
                        
                #         # Set the MediaBox of this new page to be the actual size of the segment
                #         # (0,0) for mediabox is bottom-left for the new page's own coordinate system
                #         page_to_crop_in_segment_writer.mediabox.lower_left = (0, 0)
                #         page_to_crop_in_segment_writer.mediabox.upper_right = (page_width_pt, current_segment_height_pt)
                        
                #         # Set the CropBox to select the content from the original tall page's coordinate system
                #         page_to_crop_in_segment_writer.cropbox.lower_left = (cb_llx_pt, cb_lly_pt)
                #         page_to_crop_in_segment_writer.cropbox.upper_right = (cb_urx_pt, cb_ury_pt)

                #         with open(segment_pdf_path, "wb") as f_segment_out:
                #             single_segment_writer.write(f_segment_out)
                        
                #         output_pdf_writer.append(segment_pdf_path)
                #         logging.info(f"Cropped segment {i+1}/{num_segments} from tall PDF, saved to {segment_pdf_path}, and added to final PDF for {url}. CropBox LLY: {cb_lly_pt:.2f}pt, URY: {cb_ury_pt:.2f}pt. Segment Height: {current_segment_height_pt:.2f}pt [Process/Worker]")
                # --- End of Commented Out Cropping Logic ---

                    # with open(segment_pdf_path, "wb") as f_segment_out:
                    #     single_segment_writer.write(f_segment_out)
                    # 
                    # output_pdf_writer.append(segment_pdf_path)
                    # logging.info(f"Cropped segment {i+1}/{num_segments} from tall PDF, saved to {segment_pdf_path}, and added to final PDF for {url}. [Process/Worker]")
            
            with open(final_pdf_path, "wb") as f_out:
                output_pdf_writer.write(f_out)
            logging.info(f"Final combined PDF saved: {final_pdf_path} for {url} [Process/Worker]")

        finally:
            # Clean up temporary segment PDFs
            for temp_path in temp_segment_paths:
                if os.path.exists(temp_path):
                    try:
                        os.remove(temp_path)
                        logging.debug(f"Cleaned up temporary segment PDF: {temp_path} [Process/Worker]")
                    except Exception as e_clean_temp:
                        logging.warning(f"Could not clean up temporary segment PDF {temp_path}: {e_clean_temp} [Process/Worker]")
            # Ensure temp_tall_pdf_path (if it was ever created due to an error before this refactor) is cleaned up
            # This check can be removed later if temp_tall_pdf_path is fully purged from the function's logic paths
            if 'temp_tall_pdf_path' in locals() and temp_tall_pdf_path and os.path.exists(temp_tall_pdf_path):
                try:
                    os.remove(temp_tall_pdf_path)
                    logging.debug(f"Cleaned up vestigial temp_tall_pdf_path: {temp_tall_pdf_path} [Process/Worker]")
                except Exception as e_remove_vestigial:
                    logging.warning(f"Could not remove vestigial temp_tall_pdf_path {temp_tall_pdf_path}: {e_remove_vestigial} [Process/Worker]")
        return final_pdf_path

    except Exception as e:
        logging.error(f"Error in capture_and_save_pdf for {url} [Process/Worker]: {e}", exc_info=True)
        # Clean up temp tall PDF if error occurred mid-process
        if 'temp_tall_pdf_path' in locals() and temp_tall_pdf_path and os.path.exists(temp_tall_pdf_path):
            try:
                os.remove(temp_tall_pdf_path)
            except Exception as e_remove_err:
                logging.warning(f"Could not remove temp tall PDF {temp_tall_pdf_path} during error cleanup: {e_remove_err} [Process/Worker]")
        return None

def process_worker_task(storage_state_path: str, url: str, output_dir: str, filename: str):
    """Capture a single URL to PDF using a Playwright session private to this call.

    Intended to run in a separate worker process: it starts its own Playwright
    driver and headless Chromium, restores the session from the JSON file at
    ``storage_state_path``, and hands a fresh page to ``capture_and_save_pdf``.
    When a PDF path comes back, ``remove_trailing_blank_page`` is run on it.

    Args:
        storage_state_path: Path to a JSON file holding the Playwright storage state.
        url: Page to capture.
        output_dir: Directory forwarded to ``capture_and_save_pdf``.
        filename: Output file name forwarded to ``capture_and_save_pdf``.

    Returns:
        A ``(url, pdf_path, filename)`` tuple. ``pdf_path`` is whatever
        ``capture_and_save_pdf`` returned, or ``None`` if any step raised.
        Exceptions are logged here and never propagate to the caller.
    """
    # Start every handle as None so teardown can tell what was actually opened.
    playwright = browser = context = page = None
    saved_pdf = None
    try:
        logging.info(f"Process worker starting for URL: {url}")
        # Each call owns its own driver and browser; workers run headless.
        playwright = sync_playwright().start()
        browser = playwright.chromium.launch(headless=True)

        # Restore the session saved to the temporary storage-state file.
        try:
            with open(storage_state_path, 'r') as state_file:
                saved_session = json.load(state_file)
        except Exception as load_err:
            logging.error(f"Process worker for {url} failed to load storage state from {storage_state_path}: {load_err}")
            raise  # Let the outer handler report this task as failed

        context_options = {
            "storage_state": saved_session,
            "viewport": {"width": DEFAULT_VIEWPORT_WIDTH, "height": 1080},
            "device_scale_factor": 1,
            "is_mobile": False,
            "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        }
        context = browser.new_context(**context_options)
        page = context.new_page()

        saved_pdf = capture_and_save_pdf(page, url, output_dir, filename)

        # Only post-process when a PDF was actually produced.
        if saved_pdf:
            logging.info(f"Attempting to remove trailing blank page from: {saved_pdf} [Process/Worker]")
            remove_trailing_blank_page(saved_pdf)
    except Exception as e:
        logging.error(f"Error in process_worker_task for {url}: {e}", exc_info=True)
        # Keep url and filename in the result so the caller can still map the failure.
        return url, None, filename
    else:
        # url and filename are echoed back so results can be matched to their task.
        return url, saved_pdf, filename
    finally:
        # Release innermost-first: page, context, browser, then the driver itself.
        teardown_steps = (
            ("Error closing page", page, "close"),
            ("Error closing context", context, "close"),
            ("Error closing browser", browser, "close"),
            ("Error stopping playwright", playwright, "stop"),
        )
        for failure_prefix, resource, method_name in teardown_steps:
            if not resource:
                continue
            try:
                getattr(resource, method_name)()
            except Exception as e:
                # Best-effort cleanup: one failed release must not block the rest.
                logging.debug(f"{failure_prefix} for {url}: {e}")
        logging.info(f"Process worker finished for URL: {url}")

def _cell_text(row, position):
    """Return the cell at 0-based *position* of *row* as text ("" when it is NaN/None)."""
    value = row.iloc[position]
    return str(value) if pd.notna(value) else ""


def _build_report_excel_bytes(report_df):
    """Serialize *report_df* to an in-memory .xlsx workbook and return its bytes.

    The frame is written without its index to a sheet named 'Report'. Every
    column is widened to fit the longer of its header and its longest
    stringified cell, plus two characters of padding.
    """
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        report_df.to_excel(writer, index=False, sheet_name='Report')
        worksheet = writer.sheets['Report']
        for i, col_name in enumerate(report_df.columns):
            # Select by position so a duplicated header name still yields a
            # single Series (label selection would return a DataFrame).
            column = report_df.iloc[:, i]
            max_data_len = column.astype(str).map(len).max() if not column.empty else 0
            column_width = max(len(str(col_name)), max_data_len) + 2  # Add a little padding
            worksheet.set_column(i, i, column_width)
    return output.getvalue()


def main():
    """Streamlit entry point: upload an Excel sheet of URLs and capture each as a PDF.

    Flow:
      1. Read the uploaded workbook and check that columns B, C, D and F exist.
      2. For every row, build the target PDF filename and queue a capture task;
         rows whose URL is empty or not http(s) are marked as skipped.
      3. On "Start Capture", log in once, then fan the tasks out to a
         ProcessPoolExecutor running ``process_worker_task``.
      4. Record a per-row 'Capture Status' and offer the annotated sheet for
         download.

    Any unexpected exception is reported in the UI and logged; nothing is
    re-raised to the caller.
    """
    # Streamlit re-executes the script on every widget interaction, so an
    # unconditional atexit.register() would add one more handler per rerun.
    # Register the cleanup only once per session.
    if not st.session_state.get('atexit_cleanup_registered'):
        atexit.register(cleanup_temp_storage_file)  # Register cleanup function
        st.session_state['atexit_cleanup_registered'] = True

    st.title("MSD Portal Screenshot Capture")

    # File upload
    uploaded_file = st.file_uploader("Upload Excel file with URLs", type=['xlsx', 'xls'])

    if uploaded_file is not None:
        try:
            input_df = read_excel(uploaded_file)
            if input_df.empty:
                st.error("Failed to read Excel file or it's empty.")
                return

            # Define 0-indexed column indices for data extraction
            # Col B: Type of Material, Col C: Name, Col D: Codex, Col F: URL
            TYPE_COL_IDX = 1
            NAME_COL_IDX = 2
            CODEX_COL_IDX = 3
            URL_COL_IDX = 5

            # Verify required columns exist by checking indices against df.columns length
            max_required_idx = max(TYPE_COL_IDX, NAME_COL_IDX, CODEX_COL_IDX, URL_COL_IDX)
            if max_required_idx >= len(input_df.columns):
                st.error(f"Excel file is missing required columns. It has {len(input_df.columns)} columns, but needs at least {max_required_idx + 1} for all data (URL, Name, Type, Codex). Please check columns B, C, D, and F.")
                logging.error(f"Excel column check failed. Found columns: {input_df.columns.tolist()}. Required max index: {max_required_idx}")
                return

            input_df['PDF Final Name'] = ""
            input_df['Capture Status'] = "Pending"

            tasks_to_submit = []
            for position, (index, row) in enumerate(input_df.iterrows()):
                # Row number as shown in Excel: 1-based plus one header row.
                # Derived from the position so it does not depend on the
                # DataFrame index being a default integer range.
                excel_row = position + 2
                try:
                    # Safely access data using iloc for position-based access
                    type_of_material = _cell_text(row, TYPE_COL_IDX)
                    name = _cell_text(row, NAME_COL_IDX)
                    codex = _cell_text(row, CODEX_COL_IDX)
                    # Strip once up front: surrounding whitespace in the cell must
                    # neither fail the scheme check nor reach the worker.
                    url = _cell_text(row, URL_COL_IDX).strip()

                    if not url or not url.lower().startswith(('http://', 'https://')):
                        logging.warning(f"Skipping row {excel_row} (Excel row number) due to invalid or empty URL: '{url}'")
                        input_df.loc[index, 'PDF Final Name'] = "SKIPPED_INVALID_URL"
                        input_df.loc[index, 'Capture Status'] = "Skipped (Invalid URL)"
                        continue

                    pdf_final_name = generate_custom_filename(type_of_material, name, codex)
                    input_df.loc[index, 'PDF Final Name'] = pdf_final_name
                    tasks_to_submit.append({'url': url, 'filename': pdf_final_name, 'original_index': index})

                except IndexError:
                    err_msg = f"Error processing row {excel_row} in Excel. Not enough columns or incorrect structure."
                    st.warning(err_msg)  # Use warning for individual row errors to not halt all
                    logging.error(f"{err_msg} Available columns in row: {len(row)} based on df structure: {len(input_df.columns)}.")
                    input_df.loc[index, 'PDF Final Name'] = "ERROR_PROCESSING_ROW_STRUCTURE"
                    input_df.loc[index, 'Capture Status'] = "Error"
                    continue
                except Exception as e:
                    err_msg = f"Unexpected error processing row {excel_row} for filename generation: {e}"
                    st.warning(err_msg)
                    logging.error(err_msg, exc_info=True)
                    input_df.loc[index, 'PDF Final Name'] = "ERROR_PROCESSING_ROW_UNKNOWN"
                    input_df.loc[index, 'Capture Status'] = "Error"
                    continue

            # Number of tasks that will actually be submitted to the workers
            total_urls = len(tasks_to_submit)
            if total_urls == 0:
                st.info("No valid URLs found to process after initial parsing.")
                # Still provide the annotated Excel so skipped/errored rows are visible.
                # (input_df cannot be empty here: the empty case returned above.)
                st.download_button(
                    label="Download Initial Report (No Captures Done)",
                    data=_build_report_excel_bytes(input_df),
                    file_name="screenshot_report_initial.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )
                return

            # Login credentials
            username = st.text_input("Username")
            password = st.text_input("Password", type="password")
            login_url = "https://profesionales.msd.es/login"

            if st.button("Start Capture"):
                if not username or not password:
                    st.error("Please enter username and password")
                    return

                # Login (once) and get path to storage state file
                storage_state_path = login_to_portal(username, password, login_url)
                if storage_state_path is None:
                    st.error("Login failed. Cannot proceed.")
                    return  # Exit if login failed

                # Create a unique directory for this batch of PDFs (only after a successful login)
                timestamp = time.strftime("%Y%m%d-%H%M%S")
                pdf_dir = os.path.join("pdf_outputs", timestamp)
                os.makedirs(pdf_dir, exist_ok=True)
                st.success(f"PDFs will be saved in: {os.path.abspath(pdf_dir)}")

                progress_bar = st.progress(0.0)
                status_text = st.empty()
                processed_count = 0
                successful_captures = 0

                # Never start more processes than there are tasks
                max_processes = min(os.cpu_count() or 1, total_urls)
                logging.info(f"Using {max_processes} worker processes for {total_urls} tasks.")

                with ProcessPoolExecutor(max_workers=max_processes) as executor:
                    # Map each future back to its task so results can be written to the right row
                    futures = {
                        executor.submit(process_worker_task, storage_state_path, task['url'], pdf_dir, task['filename']): task
                        for task in tasks_to_submit
                    }

                    status_text.text(f"Submitted {total_urls} URLs to {max_processes} processes. Processing...")

                    for future in as_completed(futures):
                        task_info = futures[future]
                        original_index = task_info['original_index']
                        original_url = task_info['url']
                        submitted_filename = task_info['filename']  # This is the 'PDF Final Name'

                        try:
                            # Result from process_worker_task is (url, pdf_result_path, returned_filename)
                            res_url, pdf_result_path, returned_filename = future.result()

                            if pdf_result_path and returned_filename == submitted_filename:
                                successful_captures += 1
                                input_df.loc[original_index, 'Capture Status'] = 'Success'
                                logging.info(f"Successfully processed URL: {res_url} -> PDF: {pdf_result_path} (Filename: {returned_filename})")
                            elif pdf_result_path and returned_filename != submitted_filename:
                                input_df.loc[original_index, 'Capture Status'] = 'Failed (Filename Mismatch)'
                                logging.warning(f"Worker returned a PDF but filename mismatched for URL: {original_url}. Expected: {submitted_filename}, Got: {returned_filename}, Path: {pdf_result_path}")
                            else:  # pdf_result_path is None
                                input_df.loc[original_index, 'Capture Status'] = 'Failed (Worker Returned No PDF)'
                                logging.warning(f"Worker failed to return PDF for URL: {original_url} (Filename: {submitted_filename})")

                        except Exception as exc:
                            logging.error(f"URL {original_url} (Filename: {submitted_filename}) generated an exception in worker process: {exc}", exc_info=True)
                            input_df.loc[original_index, 'Capture Status'] = 'Error (Worker Exception)'
                        finally:
                            # Progress advances for every finished task, successful or not
                            processed_count += 1
                            progress = processed_count / total_urls
                            progress_bar.progress(progress)
                            status_text.text(f"Processed {processed_count}/{total_urls} URLs... Success: {successful_captures}")

                # --- End of ProcessPoolExecutor block ---

                # Final status update
                status_text.text(f"Processing complete. Processed {processed_count}/{total_urls}.")
                st.success(f"Completed! Successfully captured {successful_captures} out of {total_urls} pages")

                # Offer the annotated input sheet (with final names and statuses) for download
                st.download_button(
                    label="Download Report",
                    data=_build_report_excel_bytes(input_df),
                    file_name="screenshot_report_final.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )

        except Exception as e:
            st.error(f"An critical error occurred in the main process: {e}")
            logging.error(f"Application error: {e}", exc_info=True)
        finally:
            # No global playwright/browser objects to clean up here anymore
            # Temporary storage state file cleanup is now handled by atexit
            logging.info("Main process try-finally block finished.")

if __name__ == "__main__":
    # Need this guard for ProcessPoolExecutor on some platforms (like Windows).
    # NOTE(review): presumably because worker processes started with the "spawn"
    # method re-import this module, and without the guard each one would call
    # main() again — confirm against the multiprocessing guidelines.
    main()