#!/usr/bin/env python3
import os
import sys
import argparse
import pandas as pd
import time
import logging
import requests
from bs4 import BeautifulSoup
import datetime
import json
import tempfile
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
from pathlib import Path
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
import xlsxwriter
from io import BytesIO
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email.mime.text import MIMEText
from email.utils import formatdate
from email import encoders
import threading

# Setup logging
# The log file lives next to this script so the daemon writes to a
# predictable location regardless of the current working directory.
log_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'link_checker_daemon_multiuser.log')
logging.basicConfig(
    filename=log_file,
    filemode='a',  # append across daemon restarts rather than truncating
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)
logging.info("Starting app_daemon_multiuser.py with log file: %s", log_file)

# Global cache for link statuses to avoid re-checking the same URL.
# Shared across worker threads, hence the lock guarding every access.
checked_links = {}
checked_links_lock = threading.Lock()

def initialize_playwright(headless=True):
    """Start a Playwright instance and launch a Chromium browser.

    Args:
        headless: Launch the browser without a visible window (default True).

    Returns:
        A ``(playwright, browser)`` tuple. The caller owns both objects and
        is responsible for calling ``browser.close()`` and ``playwright.stop()``.
    """
    pw_instance = sync_playwright().start()
    chromium_browser = pw_instance.chromium.launch(headless=headless)
    logging.info("Initialized Playwright and launched browser")
    return pw_instance, chromium_browser

def login_to_portal(username, password, login_url, headless=True):
    """Login to the portal and save storage state to a temp file for reuse.

    Drives the portal's two-step sign-in form with Playwright (username ->
    "Siguiente" -> username again + password -> "Acceda"; the UI labels are
    Spanish), waits until the browser is redirected away from any URL
    containing "login", then persists the authenticated context's storage
    state (cookies etc.) to a temporary JSON file.

    Args:
        username: Account name typed into the sign-in form.
        password: Account password.
        login_url: URL of the portal login page.
        headless: Launch Chromium headless (default True).

    Returns:
        ``(True, storage_state_path)`` on success, ``(False, None)`` on any
        failure. The temp JSON file is left on disk; the caller is expected
        to delete it when done.

    Side effects:
        Writes ``debug_screenshot_*.png`` files into the current working
        directory at each step of the flow, for post-mortem debugging.
    """
    playwright = None
    browser = None
    context = None
    page = None
    
    try:
        logging.info("Attempting login and saving session state...")
        playwright = sync_playwright().start()
        browser = playwright.chromium.launch(headless=headless)
        context = browser.new_context()
        page = context.new_page()
        
        logging.info("Navigating to login page...")
        page.goto(login_url)
        logging.info(f"Current URL after navigate: {page.url}")
        page.screenshot(path="debug_screenshot_01_navigate.png")
        time.sleep(1)
        
        # Accept cookies if present (OneTrust consent banner). Missing banner
        # is not an error; we log and continue.
        try:
            accept_button = page.locator("#onetrust-accept-btn-handler")
            if accept_button.is_visible(timeout=3000):
                accept_button.click()
                logging.info("Accepted cookies during login attempt")
                logging.info(f"Current URL after cookies: {page.url}")
                page.screenshot(path="debug_screenshot_02_cookies_accepted.png")
                time.sleep(1)
        except Exception as e:
            logging.info(f"Cookie dialog not found or error during login: {e}")
            logging.info(f"Current URL cookies not found: {page.url}")
            page.screenshot(path="debug_screenshot_02_cookies_not_found.png")

        # Enter username (first step of the two-step form)
        page.locator("#capture_signInFull_username").fill(username)
        logging.info("Entered username")
        logging.info(f"Current URL after username: {page.url}")
        page.screenshot(path="debug_screenshot_03_username_entered.png")
        
        # Click next
        page.locator("#buttonNext_signInFull").click()
        logging.info("Clicked 'Siguiente' button")
        logging.info(f"Current URL after next: {page.url}")
        page.screenshot(path="debug_screenshot_04_next_clicked.png")
        
        # Re-enter username and password — the second step presents a fresh
        # username field (different element id) alongside the password field.
        username_field_retry = page.locator("#capture_signInFull_signInUsername")
        username_field_retry.wait_for(state='visible', timeout=10000)
        if username_field_retry.is_visible():
            username_field_retry.fill(username)
            logging.info("Re-entered username")
            logging.info(f"Current URL after username retry: {page.url}")
            page.screenshot(path="debug_screenshot_05_username_retry.png")
        
        page.locator("#capture_signInFull_currentPassword").fill(password)
        logging.info("Entered password")
        logging.info(f"Current URL after password: {page.url}")
        page.screenshot(path="debug_screenshot_06_password_entered.png")
        
        # Submit
        page.get_by_role("button", name="Acceda").click()
        logging.info("Clicked 'Acceda' button")
        logging.info(f"Current URL after submit: {page.url}")
        page.screenshot(path="debug_screenshot_07_submit_clicked.png")

        # Wait for login confirmation: any URL not containing "login" counts
        # as a successful redirect away from the sign-in flow.
        page.wait_for_url(lambda url: "login" not in url, timeout=25000)
        logging.info(f"Login successful, redirected to: {page.url}")
        page.screenshot(path="debug_screenshot_08_login_success.png")
        
        # Save storage state to a temporary file so other browser contexts /
        # plain-requests sessions can reuse the authenticated cookies.
        storage_state = context.storage_state()
        
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
            json.dump(storage_state, temp_file)
            storage_state_path = temp_file.name
            logging.info(f"Saved storage state to temporary file: {storage_state_path}")
        
        return True, storage_state_path
    except Exception as e:
        logging.error(f"An error occurred during login: {e}", exc_info=True)
        if page:
            page.screenshot(path="debug_screenshot_09_error.png")
        return False, None
    finally:
        # Cleanup resources used for login
        if page: page.close()
        if context: context.close()
        if browser: browser.close()
        if playwright: playwright.stop()

def read_excel(file_path, test_limit=None):
    """Read URLs from an Excel file.

    Args:
        file_path: Path to the workbook; must contain a 'URL' column.
        test_limit: When set to a positive integer, only the first N rows
            are kept (test mode).

    Returns:
        A pandas DataFrame with (at least) the 'URL' column.

    Raises:
        KeyError: If the 'URL' column is missing.
        Exception: Any pandas read error is logged and re-raised.
    """
    try:
        frame = pd.read_excel(file_path)
        if 'URL' not in frame.columns:
            raise KeyError("Excel file does not contain 'URL' column")

        # Optional test-mode truncation to the first N rows.
        if test_limit and test_limit > 0:
            logging.info(f"TEST MODE: Limiting to first {test_limit} URLs")
            frame = frame.head(test_limit)

        logging.info(f"Excel file read successfully: {file_path}, processing {len(frame)} URLs")
        return frame
    except Exception as e:
        logging.error(f"Error reading Excel file: {e}", exc_info=True)
        raise

def read_txt_urls(file_path):
    """Read URLs from a txt file, one per line.

    Blank lines are skipped and surrounding whitespace is stripped.

    Args:
        file_path: Path to the text file.

    Returns:
        A pandas DataFrame with a single 'URL' column.
    """
    try:
        with open(file_path, 'r') as handle:
            stripped_lines = (raw.strip() for raw in handle)
            urls = [entry for entry in stripped_lines if entry]
        df = pd.DataFrame({'URL': urls})
        logging.info(f"Txt file read successfully: {file_path}, processing {len(df)} URLs")
        return df
    except Exception as e:
        logging.error(f"Error reading txt file: {e}", exc_info=True)
        raise

def read_user_pass_file(file_path):
    """Read username:password pairs from a txt file.

    Each line of the form ``user:pass`` yields one pair; only the first
    colon separates the fields, so passwords may themselves contain colons.
    Lines without a colon are ignored.

    Args:
        file_path: Path to the credentials file.

    Returns:
        A list of ``(username, password)`` tuples, both stripped.
    """
    try:
        pairs = []
        with open(file_path, 'r') as handle:
            for raw in handle:
                entry = raw.strip()
                if ':' not in entry:
                    continue
                user, _, pwd = entry.partition(':')
                pairs.append((user.strip(), pwd.strip()))
        logging.info(f"User/pass file read successfully: {file_path}, found {len(pairs)} pairs")
        return pairs
    except Exception as e:
        logging.error(f"Error reading user/pass file: {e}", exc_info=True)
        raise

def check_link_status(url, session_cookies, login_url, progress_queue=None, retry_count=2, retry_timeout_multiplier=2, initial_timeout=15, verbose=False, use_browser_for_external=False):
    """Check if a link is working using requests or Playwright for external links, with caching.

    Args:
        url: Link to verify.
        session_cookies: dict of cookie name -> value for authenticated requests.
        login_url: Portal login URL; a redirect back here means the session expired.
        progress_queue: Optional queue; one item is put per completed check.
        retry_count: Number of retries after the initial attempt.
        retry_timeout_multiplier: Timeout growth factor between attempts.
        initial_timeout: Timeout (seconds) for the first attempt.
        verbose: Print per-link progress to stdout.
        use_browser_for_external: Check external links with a real browser.

    Returns:
        One of "OK", "Broken" or "Requires Authentication". Results are
        cached in the module-level ``checked_links`` dict so repeated URLs
        are not re-fetched.
    """
    # Consult the shared cache first. The lock matters: this function runs
    # concurrently inside a ThreadPoolExecutor.
    with checked_links_lock:
        if url in checked_links:
            if verbose:
                print(f"  Using cached status for {url}: {checked_links[url]}")
            result = checked_links[url]
        else:
            result = None

    if result is not None:
        if progress_queue:
            progress_queue.put(1)
        return result

    # Anything not under the portal's base URL counts as an external link.
    base_url = login_url.replace('/login/', '/')
    is_external = not url.startswith(base_url)

    if is_external and use_browser_for_external:
        # Use Playwright for external links (some sites block plain HTTP clients)
        result = check_link_status_browser(url, verbose)
    else:
        # Use requests for internal links or if not using browser for external
        current_timeout = initial_timeout

        for attempt in range(retry_count + 1):  # +1 for the initial attempt
            try:
                is_retry = attempt > 0
                if is_retry and verbose:
                    print(f"    Retry {attempt}/{retry_count} for {url} with timeout {current_timeout}s")

                headers = {
                    "User-Agent": "Mozilla/5.0 (compatible; LinkChecker/1.0)"
                }
                response = requests.head(url, allow_redirects=True, timeout=current_timeout, headers=headers, cookies=session_cookies)

                final_url = response.url
                if final_url.startswith(login_url):
                    # A redirect to the login page means the session is gone;
                    # retrying with the same cookies cannot help.
                    logging.warning(f"Link redirects to login page: {url}")
                    result = "Requires Authentication"
                    break
                elif response.status_code < 400:
                    result = "OK"
                    break
                elif attempt == retry_count:
                    # Last attempt still failed.
                    # BUG FIX: the original condition was `is_retry and attempt ==
                    # retry_count`, which with retry_count=0 never matched, so no
                    # branch set `result` and the function cached/returned None.
                    logging.warning(f"Broken link confirmed after {retry_count} retries: {url} with status {response.status_code}")
                    result = "Broken"
                    break
                elif is_retry:
                    # Failed again but we have more retries available
                    current_timeout = current_timeout * retry_timeout_multiplier
                    logging.info(f"Link retry {attempt}/{retry_count} failed: {url}. Will retry with longer timeout.")
                    continue
                else:
                    # First attempt failed, prepare for retry
                    logging.warning(f"Potential broken link detected: {url} with status {response.status_code}. Will retry with longer timeout.")
                    current_timeout = current_timeout * retry_timeout_multiplier
                    continue

            except requests.RequestException as e:
                if attempt == retry_count:
                    # This is the last retry and it still failed
                    logging.error(f"Request exception for URL {url} after {retry_count} retries: {e}")
                    result = "Broken"
                    break
                else:
                    # Prepare for retry
                    current_timeout = current_timeout * retry_timeout_multiplier
                    logging.info(f"Request exception during attempt {attempt+1}/{retry_count+1} for URL {url}: {e}. Will retry.")
                    continue
            except Exception as e:
                if attempt == retry_count:
                    # This is the last retry and it still failed
                    logging.error(f"Unexpected error for URL {url} after {retry_count} retries: {e}", exc_info=True)
                    result = "Broken"
                    break
                else:
                    # Prepare for retry
                    current_timeout = current_timeout * retry_timeout_multiplier
                    logging.info(f"Unexpected error during attempt {attempt+1}/{retry_count+1} for URL {url}: {e}. Will retry.")
                    continue

        # Defensive default: never cache or return None if the loop fell through.
        if result is None:
            result = "Broken"

    # Cache the result
    with checked_links_lock:
        checked_links[url] = result

    if progress_queue:
        progress_queue.put(1)
    return result

def check_link_status_browser(url, verbose=False):
    """Check if an external link works by navigating to it with Playwright.

    Args:
        url: External URL to load.
        verbose: Print progress to stdout.

    Returns:
        "OK" when navigation yields a successful response, "Broken" on a
        failed response or any navigation error.
    """
    playwright = browser = context = page = None

    try:
        if verbose:
            print(f"  Checking external link with browser: {url}")

        playwright = sync_playwright().start()
        browser = playwright.chromium.launch(headless=True)
        context = browser.new_context()
        page = context.new_page()

        # Navigate and judge the link by the main response's status.
        response = page.goto(url, wait_until="networkidle", timeout=30000)
        verdict = "OK" if (response and response.ok) else "Broken"

        if verbose:
            print(f"  Browser check result: {verdict}")

        return verdict
    except Exception as e:
        logging.error(f"Browser check failed for {url}: {e}")
        if verbose:
            print(f"  Browser check failed: {str(e)}")
        return "Broken"
    finally:
        # Tear down in reverse order of creation; each guard tolerates a
        # partially-initialized state after an early failure.
        if page: page.close()
        if context: context.close()
        if browser: browser.close()
        if playwright: playwright.stop()

def check_link_status_without_login(url, login_url, retry_count=2, retry_timeout_multiplier=2, initial_timeout=15, verbose=False):
    """Check if a link is accessible without login, with retry logic.

    Args:
        url: Link to probe anonymously (no cookies sent).
        login_url: Portal login URL; a redirect here means auth is required.
        retry_count: Number of retries after the initial attempt.
        retry_timeout_multiplier: Timeout growth factor between attempts.
        initial_timeout: Timeout (seconds) for the first attempt.
        verbose: Print retry progress to stdout.

    Returns:
        "Publicly Accessible" (HTTP < 400), "Requires Authentication"
        (redirected to login_url) or "Broken".
    """
    current_timeout = initial_timeout

    for attempt in range(retry_count + 1):  # +1 for the initial attempt
        try:
            is_retry = attempt > 0
            if is_retry and verbose:
                print(f"    Retry {attempt}/{retry_count} for {url} with timeout {current_timeout}s")

            headers = {
                "User-Agent": "Mozilla/5.0 (compatible; LinkChecker/1.0)"
            }
            response = requests.head(url, allow_redirects=True, timeout=current_timeout, headers=headers)

            final_url = response.url
            if final_url.startswith(login_url):
                return "Requires Authentication"

            if response.status_code < 400:
                return "Publicly Accessible"
            elif attempt == retry_count:
                # Last attempt still failed.
                # FIX: the original condition also required `is_retry`, so with
                # retry_count=0 the final attempt logged a misleading "Will
                # retry with longer timeout" message and fell through to the
                # default return without this confirmation log.
                logging.warning(f"Broken link confirmed after {retry_count} retries: {url} with status {response.status_code}")
                return "Broken"
            elif is_retry:
                # Failed again but we have more retries available
                current_timeout = current_timeout * retry_timeout_multiplier
                logging.info(f"Link retry {attempt}/{retry_count} failed: {url}. Will retry with longer timeout.")
                continue
            else:
                # First attempt failed, prepare for retry
                logging.warning(f"Potential broken link detected: {url} with status {response.status_code}. Will retry with longer timeout.")
                current_timeout = current_timeout * retry_timeout_multiplier
                continue

        except requests.RequestException as e:
            if attempt == retry_count:
                # This is the last retry and it still failed
                logging.error(f"Request exception for URL {url} after {retry_count} retries: {e}")
                return "Broken"
            else:
                # Prepare for retry
                current_timeout = current_timeout * retry_timeout_multiplier
                logging.info(f"Request exception during attempt {attempt+1}/{retry_count+1} for URL {url}: {e}. Will retry.")
                continue
        except Exception as e:
            if attempt == retry_count:
                # This is the last retry and it still failed
                logging.error(f"Unexpected error for URL {url} after {retry_count} retries: {e}", exc_info=True)
                return "Broken"
            else:
                # Prepare for retry
                current_timeout = current_timeout * retry_timeout_multiplier
                logging.info(f"Unexpected error during attempt {attempt+1}/{retry_count+1} for URL {url}: {e}. Will retry.")
                continue

    return "Broken"  # Defensive default; every path above now returns explicitly.

def extract_links(storage_state_path, parent_url, wait_time=4, verbose=False):
    """Extract links from a webpage using Playwright and BeautifulSoup.

    Loads `parent_url` in an authenticated browser context (restored from the
    storage-state JSON file), then collects unique <a href> and <img src>
    targets found inside the container with id "mhh_mcn_main".

    Args:
        storage_state_path: Path to a Playwright storage-state JSON file.
        parent_url: Page to load and scan.
        wait_time: Extra seconds to wait after network idle before parsing.
        verbose: Print progress to stdout.

    Returns:
        List of ``{'url': ..., 'type': 'Article/Product'|'Image'}`` dicts;
        empty list when the container is missing or any error occurs.
    """
    playwright = browser = context = page = None

    try:
        if verbose:
            print(f"  Opening page to extract links...")

        playwright = sync_playwright().start()
        browser = playwright.chromium.launch(headless=True)

        # Restore the authenticated session saved by login_to_portal.
        with open(storage_state_path, 'r') as f:
            saved_state = json.load(f)

        context = browser.new_context(storage_state=saved_state)
        page = context.new_page()

        page.goto(parent_url, wait_until="networkidle", timeout=60000)
        logging.info(f"Navigated to URL: {parent_url}")
        page.wait_for_timeout(wait_time * 1000)  # seconds -> milliseconds

        if verbose:
            print(f"  Parsing HTML content...")

        soup = BeautifulSoup(page.content(), 'html.parser')

        main_content = soup.find(id="mhh_mcn_main")
        if main_content is None:
            logging.warning(f"No main content found for URL: {parent_url}")
            if verbose:
                print(f"  No main content container found (id='mhh_mcn_main')")
            return []

        # Use a set of (type, url) tuples to de-duplicate as we collect.
        found = set()
        for anchor in main_content.find_all('a', href=True):
            target = anchor['href']
            if not target.startswith('#'):  # skip same-page fragment links
                found.add(('Article/Product', target))

        for image in main_content.find_all('img', src=True):
            found.add(('Image', image['src']))

        unique_links = [{'url': link_url, 'type': kind} for kind, link_url in found]

        logging.info(f"Extracted {len(unique_links)} unique child links from {parent_url}.")
        if verbose:
            print(f"  Extracted {len(unique_links)} unique links")
        return unique_links
    except Exception as e:
        logging.error(f"Error extracting links from {parent_url}: {e}", exc_info=True)
        if verbose:
            print(f"  Error extracting links: {str(e)}")
        return []
    finally:
        if page: page.close()
        if context: context.close()
        if browser: browser.close()
        if playwright: playwright.stop()

def compile_results_for_homepage(user_pass_pairs, homepage_url, login_url, wait_time, verbose=False, max_retries=2, retry_timeout_multiplier=2, use_browser_for_external=False):
    """Check homepage links for each user:pass pair using multi-threading.

    Each user is processed by a worker thread: log in, extract the homepage's
    child links, check each one with the user's session cookies, and return
    one result row per link (or a single failure row).

    Args:
        user_pass_pairs: List of (username, password) tuples.
        homepage_url: Page whose links are checked for every user.
        login_url: Portal login URL.
        wait_time: Seconds to wait after page load before extracting links.
        verbose: Print progress to stdout.
        max_retries / retry_timeout_multiplier: Retry policy for link checks.
        use_browser_for_external: Check external links with a real browser.

    Returns:
        Flat list of result dicts with keys User, Parent URL, Child URL,
        Link Type, Status and Notes.
    """
    all_results = []

    def process_user(user_pass):
        # Worker: handle one user end-to-end (login -> extract -> check).
        username, password = user_pass
        user_results = []
        logging.info(f"Processing user: {username}")
        if verbose:
            print(f"\nProcessing user: {username}")

        # Login for this user
        login_success, storage_state_path = login_to_portal(username, password, login_url)
        if not login_success:
            logging.error(f"Login failed for user: {username}")
            if verbose:
                print(f"  Login failed for user: {username}")
            user_results.append({
                'User': username,
                'Parent URL': homepage_url,
                'Child URL': '',
                'Link Type': '',
                'Status': 'Login Failed',
                'Notes': 'Could not login with provided credentials'
            })
            return user_results

        try:
            # Convert the Playwright storage-state cookies to the flat
            # name -> value dict that requests expects.
            with open(storage_state_path, 'r') as f:
                storage_state = json.load(f)

            session_cookies = {}
            for cookie in storage_state.get('cookies', []):
                session_cookies[cookie['name']] = cookie['value']

            # Extract and check links on homepage
            child_links = extract_links(storage_state_path, homepage_url, wait_time, verbose)

            if not child_links:
                if verbose:
                    print(f"  No links found on homepage")
                user_results.append({
                    'User': username,
                    'Parent URL': homepage_url,
                    'Child URL': '',
                    'Link Type': '',
                    'Status': 'No Links Found',
                    'Notes': 'Main content container not found.'
                })
            else:
                if verbose:
                    print(f"  Found {len(child_links)} links to check")

                # Check each link
                for link in child_links:
                    link_url = link['url']
                    link_type = link['type']

                    status = check_link_status(
                        link_url,
                        session_cookies,
                        login_url,
                        None,  # progress_queue
                        max_retries,
                        retry_timeout_multiplier,
                        15,  # initial timeout
                        verbose,
                        use_browser_for_external
                    )

                    user_results.append({
                        'User': username,
                        'Parent URL': homepage_url,
                        'Child URL': link_url,
                        'Link Type': link_type,
                        'Status': status,
                        'Notes': ''
                    })

                    if verbose and status != "OK":
                        print(f"    - {status}: {link_url}")
        finally:
            # FIX: delete the temp storage-state file even when extraction or
            # checking raises (the original only cleaned up on the happy path).
            if storage_state_path and os.path.exists(storage_state_path):
                os.unlink(storage_state_path)

        return user_results

    # Use ThreadPoolExecutor for parallel processing
    with ThreadPoolExecutor(max_workers=5) as executor:  # Adjust max_workers as needed
        future_to_user = {executor.submit(process_user, user_pass): user_pass for user_pass in user_pass_pairs}

        for future in as_completed(future_to_user):
            username = future_to_user[future][0]
            try:
                all_results.extend(future.result())
            except Exception as e:
                # FIX: an unhandled exception in one worker no longer aborts
                # collection of every remaining user's results.
                logging.error(f"Unhandled error while processing user {username}: {e}", exc_info=True)
                all_results.append({
                    'User': username,
                    'Parent URL': homepage_url,
                    'Child URL': '',
                    'Link Type': '',
                    'Status': 'Error',
                    'Notes': str(e)
                })

    return all_results

def compile_results(df, storage_state_path, username, password, login_url, wait_time, without_login, verbose=False, max_retries=2, retry_timeout_multiplier=2, use_browser_for_external=False):
    """Check links and compile results.

    For each parent URL in ``df`` either probes the URL anonymously
    (``without_login=True``) or extracts the page's child links with an
    authenticated Playwright session and checks them concurrently (5 worker
    threads). If a child link reports "Requires Authentication", one re-login
    is attempted for the whole run and that link is re-checked.

    Args:
        df: DataFrame with a 'URL' column of parent URLs.
        storage_state_path: Playwright storage-state JSON from a prior login
            (ignored when without_login is True).
        username / password: Credentials used for mid-run re-authentication.
        login_url: Portal login URL, used to detect auth redirects.
        wait_time: Seconds to wait after page load before extracting links.
        without_login: If True, only check parent URLs anonymously.
        verbose: Print progress to stdout.
        max_retries / retry_timeout_multiplier: Retry policy for link checks.
        use_browser_for_external: Check external links with a real browser.

    Returns:
        List of result dicts (Parent URL, Child URL, Link Type, Status,
        Notes), including one summary row per parent URL.
    """
    results = []
    reauth_attempted = False  # only one re-login is attempted per run
    link_status_dict = {}  # per-run cache: child URL -> status
    progress_queue = Queue()
    session_cookies = {}
    
    if not without_login:
        # Get session cookies from storage state
        with open(storage_state_path, 'r') as f:
            storage_state = json.load(f)
            
        # Convert Playwright cookies to requests cookies format
        for cookie in storage_state.get('cookies', []):
            session_cookies[cookie['name']] = cookie['value']
    
    total_urls = len(df)
    logging.info(f"Processing {total_urls} URLs")
    if verbose:
        print(f"\n=== Processing {total_urls} URLs ===")
        if max_retries > 0:
            print(f"=== Using {max_retries} retries with timeout multiplier {retry_timeout_multiplier}x for potential false negatives ===")
    
    for index, row in df.iterrows():
        parent_url = row['URL']
        logging.info(f"Processing URL {index+1}/{total_urls}: {parent_url}")
        if verbose:
            print(f"\n[{index+1}/{total_urls}] Processing: {parent_url}")
        
        if without_login:
            # Anonymous mode: only the parent URL itself is probed.
            if verbose:
                print(f"  Checking without login...")
            status = check_link_status_without_login(
                parent_url, 
                login_url, 
                retry_count=max_retries, 
                retry_timeout_multiplier=retry_timeout_multiplier,
                verbose=verbose
            )
            note = ""
            if status == "Publicly Accessible":
                note = "URL is accessible without authentication."
            elif status == "Requires Authentication":
                note = "URL correctly requires authentication."
            
            results.append({
                'Parent URL': parent_url,
                'Child URL': '',
                'Link Type': '',
                'Status': status,
                'Notes': note
            })
            
            if verbose:
                print(f"  Result: {status}")
        else:
            # Authenticated mode: extract child links, then check them in a pool.
            if verbose:
                print(f"  Extracting links...")
            child_links = extract_links(storage_state_path, parent_url, wait_time, verbose)

            if not child_links:
                if verbose:
                    print(f"  No links found (main content container not found)")
                results.append({
                    'Parent URL': parent_url,
                    'Child URL': '',
                    'Link Type': '',
                    'Status': 'No Links Found',
                    'Notes': 'Main content container not found.'
                })
                continue

            if verbose:
                print(f"  Found {len(child_links)} links to check")
                
            all_ok = True  # Initialize as True for each parent URL
            total_links = len(child_links)
            processed_links = 0
            
            # Create a thread pool with 5 workers
            with ThreadPoolExecutor(max_workers=5) as executor:
                future_to_url = {}
                
                # Submit all tasks to the executor
                for link in child_links:
                    link_url = link['url']
                    link_type = link['type']
                    
                    if link_url in link_status_dict:  # Check if link has been processed
                        # Reuse the status from an earlier parent URL in this run.
                        processed_links += 1
                        status = link_status_dict[link_url]
                        results.append({
                            'Parent URL': parent_url,
                            'Child URL': link_url,
                            'Link Type': link_type,
                            'Status': status,
                            'Notes': ''
                        })
                        if verbose and processed_links % 5 == 0:
                            print(f"  Progress: {processed_links}/{total_links} links (using cached results)")
                        continue
                    
                    future = executor.submit(
                        check_link_status, 
                        link_url, 
                        session_cookies, 
                        login_url, 
                        progress_queue,
                        max_retries,
                        retry_timeout_multiplier,
                        15,  # Initial timeout
                        verbose,
                        use_browser_for_external
                    )
                    future_to_url[future] = (link_url, link_type)
                
                # Process completed futures as they finish
                for future in as_completed(future_to_url):
                    link_url, link_type = future_to_url[future]
                    try:
                        status = future.result()
                        
                        if status == "Requires Authentication" and not reauth_attempted:
                            # Try to re-authenticate and get new cookies.
                            # NOTE(review): futures already submitted were created
                            # with the old cookie dict object; session_cookies is
                            # mutated in place below, so in-flight checks do pick
                            # up the new values, but checks that already failed
                            # before this point are not retried — confirm this is
                            # the intended behavior.
                            if verbose:
                                print(f"  Session expired. Re-authenticating...")
                            logging.info("Re-authenticating due to session expiration")
                            login_success, new_storage_state_path = login_to_portal(username, password, login_url)
                            
                            if login_success:
                                # NOTE(review): the previous storage-state temp file
                                # is not deleted here before the path is replaced —
                                # it is left on disk.
                                storage_state_path = new_storage_state_path
                                # Update session cookies
                                with open(storage_state_path, 'r') as f:
                                    storage_state = json.load(f)
                                
                                # Mutate in place so threads holding a reference
                                # to this dict see the refreshed cookies.
                                session_cookies.clear()
                                for cookie in storage_state.get('cookies', []):
                                    session_cookies[cookie['name']] = cookie['value']
                                
                                reauth_attempted = True
                                # Retry with new cookies
                                if verbose:
                                    print(f"  Re-authentication successful. Retrying...")
                                status = check_link_status(
                                    link_url, 
                                    session_cookies, 
                                    login_url,
                                    None,
                                    max_retries,
                                    retry_timeout_multiplier,
                                    15,  # Initial timeout
                                    verbose,
                                    use_browser_for_external
                                )
                                
                                if status == "Requires Authentication":
                                    logging.error(f"Link still redirects to login after re-authentication: {link_url}")
                                    if verbose:
                                        print(f"  Link still requires authentication after re-login: {link_url}")
                                    all_ok = False
                            else:
                                logging.error("Re-authentication failed. Cannot proceed with link checking.")
                                if verbose:
                                    print(f"  Re-authentication failed!")
                                status = "Requires Authentication"
                                all_ok = False
                        elif status == "Broken":
                            all_ok = False
                        
                        link_status_dict[link_url] = status
                        results.append({
                            'Parent URL': parent_url,
                            'Child URL': link_url,
                            'Link Type': link_type,
                            'Status': status,
                            'Notes': ''
                        })
                        
                        processed_links += 1
                        logging.info(f"Processed {processed_links}/{total_links} links for {parent_url}")
                        
                        if verbose:
                            if processed_links % 5 == 0 or processed_links == total_links:
                                print(f"  Progress: {processed_links}/{total_links} links checked")
                            if status != "OK" and status != "Publicly Accessible":
                                print(f"    - {status}: {link_url}")
                        
                    except Exception as e:
                        # A worker raised: record the link as "Error" and keep going.
                        logging.error(f"Error processing {link_url}: {str(e)}")
                        link_status_dict[link_url] = "Error"
                        results.append({
                            'Parent URL': parent_url,
                            'Child URL': link_url,
                            'Link Type': link_type,
                            'Status': "Error",
                            'Notes': str(e)
                        })
                        all_ok = False
                        
                        processed_links += 1
                        if verbose:
                            print(f"  Error checking: {link_url}")
                            print(f"    - Error: {str(e)}")

            # Per-parent summary row.
            if all_ok:
                if verbose:
                    print(f"  Summary: All Links OK")
                results.append({
                    'Parent URL': parent_url,
                    'Child URL': '',
                    'Link Type': '',
                    'Status': 'All Links OK',
                    'Notes': 'Everything is working correctly.'
                })
            else:
                if verbose:
                    print(f"  Summary: Some Links Broken")
                results.append({
                    'Parent URL': parent_url,
                    'Child URL': '',
                    'Link Type': '',
                    'Status': 'Some Links Broken',
                    'Notes': 'One or more child links are broken.'
                })
        
        logging.info(f"Completed processing URL {index+1}/{total_urls}: {parent_url}")
        if verbose:
            print(f"Completed: {parent_url}")
            # Add a visual separator between URLs
            print("-" * 80)

    logging.info(f"Compiled results for {total_urls} URLs.")
    if verbose:
        print(f"\n=== Completed processing all {total_urls} URLs ===\n")
    return results

def generate_report(results, output_path=None):
    """Generate an Excel report from the link-check results.

    Args:
        results: List of result dicts (one per report row).
        output_path: Optional file path. When given, the report is written
            to disk and the path is returned; otherwise the workbook is
            built in memory and a BytesIO (rewound to position 0) is
            returned.

    Returns:
        str | BytesIO: The output path, or an in-memory buffer holding the
        formatted workbook.

    Raises:
        Exception: Re-raised after logging if report generation fails.
    """
    try:
        df_report = pd.DataFrame(results)

        # Sort only by the columns actually present; the previous version
        # sorted unconditionally by ['User', 'Parent URL', 'Child URL'] and
        # raised KeyError for result sets without a 'User' column (or when
        # results were empty).
        sort_cols = [c for c in ('User', 'Parent URL', 'Child URL') if c in df_report.columns]
        if sort_cols:
            df_report.sort_values(by=sort_cols, inplace=True)

        # pd.ExcelWriter accepts either a path or a file-like object, so one
        # code path serves both modes (previously duplicated verbatim).
        target = output_path if output_path else BytesIO()
        with pd.ExcelWriter(target, engine='xlsxwriter') as writer:
            df_report.to_excel(writer, index=False, sheet_name='Link Check Report')
            _format_report_sheet(writer, df_report)

        if output_path:
            logging.info(f"Report generated: {output_path}")
            return output_path

        target.seek(0)
        logging.info("Report generated in memory with basic formatting.")
        return target
    except Exception as e:
        logging.error(f"Failed to generate report: {e}", exc_info=True)
        raise


def _format_report_sheet(writer, df_report):
    """Apply styling to the 'Link Check Report' worksheet.

    Styles the header row, widens URL columns, zebra-stripes even rows,
    freezes the header row, and enables an autofilter over the data range.
    """
    workbook = writer.book
    worksheet = writer.sheets['Link Check Report']

    header_format = workbook.add_format({
        'bold': True,
        'text_wrap': True,
        'valign': 'top',
        'fg_color': '#D7E4BC',
        'border': 1
    })

    for col_num, value in enumerate(df_report.columns.values):
        worksheet.write(0, col_num, value, header_format)
        # URL columns need much more width than status/notes columns.
        if value in ['Parent URL', 'Child URL']:
            worksheet.set_column(col_num, col_num, 50)
        else:
            worksheet.set_column(col_num, col_num, 20)

    # Create the stripe format once; the original allocated a new xlsxwriter
    # format object for every shaded row.
    stripe_format = workbook.add_format({'bg_color': '#F0F0F0'})
    for row_num in range(1, len(df_report) + 1):
        if row_num % 2 == 0:
            worksheet.set_row(row_num, None, stripe_format)

    worksheet.freeze_panes(1, 0)
    worksheet.autofilter(0, 0, len(df_report), len(df_report.columns) - 1)

def send_email(smtp_server, smtp_port, sender_email, username, password, recipient_emails, subject, body, attachment_data=None, attachment_filename=None):
    """Send email with optional attachment (file path or in-memory data).

    Args:
        smtp_server: SMTP host name or address.
        smtp_port: SMTP port.
        sender_email: 'From' address (also the auth user fallback).
        username: SMTP auth username; falls back to sender_email when falsy.
        password: SMTP password. When falsy, no STARTTLS/login is attempted.
        recipient_emails: List of recipient addresses.
        subject: Message subject line.
        body: Plain-text message body.
        attachment_data: Optional BytesIO buffer, or path to an existing file.
        attachment_filename: Optional explicit filename for the attachment.

    Returns:
        bool: True if the message was handed to the SMTP server, False on
        any failure (failures are logged, never raised).
    """
    try:
        # Create a multipart message
        msg = MIMEMultipart()
        msg['From'] = sender_email
        msg['To'] = ', '.join(recipient_emails)
        msg['Date'] = formatdate(localtime=True)
        msg['Subject'] = subject

        # Add body to email
        msg.attach(MIMEText(body, 'plain'))

        # Add attachment if provided
        if attachment_data:
            part = MIMEBase('application', 'octet-stream')

            if isinstance(attachment_data, BytesIO):
                # Handle in-memory attachment
                part.set_payload(attachment_data.getvalue())
            elif isinstance(attachment_data, str) and os.path.exists(attachment_data):
                # Handle file path attachment
                with open(attachment_data, "rb") as attachment:
                    part.set_payload(attachment.read())
            else:
                logging.error("Invalid attachment data format")
                return False

            encoders.encode_base64(part)

            # Filename precedence: explicit arg > basename of file path >
            # timestamped default for in-memory buffers.
            if attachment_filename:
                filename = attachment_filename
            elif isinstance(attachment_data, str):
                filename = os.path.basename(attachment_data)
            else:
                filename = f"report_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"

            # BUG FIX: the original hard-coded the literal text "(unknown)"
            # in this header, so the computed filename above was never used
            # and every attachment arrived with a bogus name.
            part.add_header('Content-Disposition', f'attachment; filename="{filename}"')
            msg.attach(part)

        # Connect to server and send email
        with smtplib.SMTP(smtp_server, smtp_port) as server:
            if password:
                server.starttls()  # Enable security
                server.login(username or sender_email, password)
            server.sendmail(sender_email, recipient_emails, msg.as_string())

        logging.info(f"Email sent successfully to {', '.join(recipient_emails)}")
        return True
    except Exception as e:
        logging.error(f"Failed to send email: {e}", exc_info=True)
        return False

def main() -> int:
    """CLI entry point for the multiuser homepage link checker.

    Parses arguments, checks the homepage for each configured user
    (and any additional URLs), generates an Excel report (to file or
    in memory), and optionally emails it.

    Returns:
        int: 0 on success. Calls sys.exit() with a message on
        configuration errors or unhandled failures.
    """
    parser = argparse.ArgumentParser(description='MSD Broken Link Checker - Multiuser Homepage Version')
    parser.add_argument('--user-pass-file', help='Path to txt file with username:password pairs (required unless --without-login is used)')
    parser.add_argument('--additional-urls', help='Path to Excel file or txt file with additional URLs to check with default credentials')
    parser.add_argument('--output', required=False, help='Path to save the output report (optional, for testing)')
    parser.add_argument('--without-login', action='store_true', help='Check links without login (for additional URLs)')
    parser.add_argument('--wait-time', type=int, default=4, help='Wait time in seconds after page loads')
    parser.add_argument('--default-username', help='Default username for additional URLs (overrides .env file)')
    parser.add_argument('--default-password', help='Default password for additional URLs (overrides .env file)')
    parser.add_argument('--test', type=int, help='Test mode: specify number of URLs to process (default: process all)')
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output showing progress')
    parser.add_argument('--max-retries', type=int, default=2, help='Maximum number of retries for potential false negatives (default: 2)')
    parser.add_argument('--retry-timeout-multiplier', type=float, default=2.0, help='Multiplier for timeout on each retry (default: 2.0)')
    parser.add_argument('--use-browser-for-external', action='store_true', help='Use browser navigation for external links instead of HTTP requests')
    
    # Email arguments
    parser.add_argument('--send-email', action='store_true', help='Send report via email')
    parser.add_argument('--smtp-server', help='SMTP server address (overrides .env file)')
    parser.add_argument('--smtp-port', type=int, help='SMTP server port (overrides .env file)')
    parser.add_argument('--email-sender', help='Sender email address (overrides .env file)')
    parser.add_argument('--email-username', help='Username for SMTP authentication (overrides .env file)')
    parser.add_argument('--email-password', help='Sender email password (overrides .env file)')
    parser.add_argument('--email-recipients', help='Comma-separated list of recipient email addresses (overrides .env file)')
    
    args = parser.parse_args()
    
    # Load environment variables
    load_dotenv()
    
    # Determine login credentials for additional URLs
    default_username = args.default_username or os.environ.get('MSD_USERNAME')
    default_password = args.default_password or os.environ.get('MSD_PASSWORD')
    login_url = "https://profesionales.msd.es/login/"
    homepage_url = "https://profesionales.msd.es/"
    
    # Configure email settings
    # Email is implied whenever no --output path is given (daemon mode):
    # with no file on disk, email is the only way the report leaves the run.
    send_email_flag = args.send_email or not args.output
    
    if send_email_flag:
        smtp_server = args.smtp_server or os.environ.get('SMTP_SERVER')
        smtp_port = args.smtp_port or int(os.environ.get('SMTP_PORT', 587))
        email_sender = args.email_sender or os.environ.get('EMAIL_SENDER')
        email_username = args.email_username or os.environ.get('EMAIL_USERNAME', email_sender)
        email_password = args.email_password or os.environ.get('EMAIL_PASSWORD')
        
        recipients_str = args.email_recipients or os.environ.get('EMAIL_RECIPIENTS', '')
        email_recipients = [r.strip() for r in recipients_str.split(',') if r.strip()]
        
        # Validate email settings
        if not all([smtp_server, email_sender, email_recipients]):
            logging.error("SMTP server, sender email, and recipient emails are required to send email")
            sys.exit("Error: Email settings are incomplete. Please provide all required email parameters in .env file or as arguments.")
    
    if not args.without_login and not args.user_pass_file:
        parser.error("--user-pass-file is required unless --without-login is used")
    
    if not args.without_login and (not default_username or not default_password) and args.additional_urls:
        logging.error("Default username and password are required for additional URLs login checks")
        sys.exit("Error: Default username and password are required for additional URLs login checks. "
                 "Set them in .env file or provide as arguments.")
    
    try:
        # Show test mode notice if enabled
        if args.test:
            if args.verbose:
                print(f"\n==================================================")
                print(f"TEST MODE ENABLED: Processing only the first {args.test} URLs")
                print(f"==================================================\n")
            else:
                print(f"TEST MODE ENABLED: Processing only the first {args.test} URLs")
            logging.info(f"TEST MODE ENABLED: Processing only the first {args.test} URLs")
            
        # Read user:pass pairs (only if provided)
        user_pass_pairs = []
        if args.user_pass_file:
            if args.verbose:
                print(f"Reading user:pass file: {args.user_pass_file}")
            user_pass_pairs = read_user_pass_file(args.user_pass_file)
        
        # Check homepage for each user (only if user-pass file provided)
        homepage_results = []
        if user_pass_pairs:
            if args.verbose:
                print(f"Checking homepage for {len(user_pass_pairs)} users...")
            homepage_results = compile_results_for_homepage(
                user_pass_pairs, 
                homepage_url, 
                login_url, 
                args.wait_time, 
                args.verbose,
                args.max_retries,
                args.retry_timeout_multiplier,
                args.use_browser_for_external
            )
        
        all_results = homepage_results
        
        # Check additional URLs if provided
        additional_urls_checked = 0
        if args.additional_urls:
            if args.verbose:
                print(f"\nReading additional URLs file: {args.additional_urls}")
            
            # File format decided by extension: Excel workbook vs plain text.
            if args.additional_urls.endswith('.xlsx') or args.additional_urls.endswith('.xls'):
                df_additional = read_excel(args.additional_urls, args.test)
            else:
                df_additional = read_txt_urls(args.additional_urls)
            
            additional_urls_checked = len(df_additional)
            
            # Create output directory if output path is provided
            if args.output:
                output_dir = os.path.dirname(os.path.abspath(args.output))
                os.makedirs(output_dir, exist_ok=True)
                if args.verbose:
                    print(f"Output directory created/verified: {output_dir}")
            
            storage_state_path = None
            
            if not args.without_login:
                # Login with default credentials
                if args.verbose:
                    print(f"\nLogging in with default credentials for additional URLs...")
                # login_to_portal returns (success_flag, temp storage-state path)
                # — the Playwright session is persisted for reuse by compile_results.
                login_success, storage_state_path = login_to_portal(default_username, default_password, login_url)
                if not login_success:
                    logging.error("Default login failed. Skipping authenticated checks for additional URLs. Proceeding with homepage results only.")
                    if args.verbose:
                        print("Default login failed. Skipping additional URLs.")
                elif args.verbose:
                    print(f"Default login successful. Session state saved.")
            
            # Compile results for additional URLs only if login succeeded (or without login)
            # NOTE: short-circuit is load-bearing here — login_success is only
            # bound when args.without_login is False (the branch above ran).
            if args.without_login or login_success:
                additional_results = compile_results(
                    df_additional, 
                    storage_state_path, 
                    default_username, 
                    default_password, 
                    login_url, 
                    args.wait_time, 
                    args.without_login,
                    args.verbose,
                    args.max_retries,
                    args.retry_timeout_multiplier,
                    args.use_browser_for_external
                )
                
                # Add User column to additional results
                for result in additional_results:
                    result['User'] = 'Default'
                
                all_results.extend(additional_results)
            else:
                additional_urls_checked = 0  # Reset since not checked
            
            # Clean up
            if storage_state_path and os.path.exists(storage_state_path):
                os.unlink(storage_state_path)
                logging.info(f"Cleaned up temporary storage state file: {storage_state_path}")
                if args.verbose:
                    print(f"Cleaned up temporary session state file")
        
        # Generate report (in memory or file based on args)
        if args.verbose:
            print(f"\nGenerating report...")
            
        if args.output:
            report_path = generate_report(all_results, args.output)
            logging.info(f"Report saved to: {report_path}")
            print(f"Report saved to: {report_path}")
            report_data = report_path  # Use file path for email attachment
        else:
            # Generate in-memory report
            report_data = generate_report(all_results)
            logging.info("Report generated in memory")
            if args.verbose:
                print(f"Report generated in memory (not saved to disk)")
        
        # Send email 
        if send_email_flag:
            if args.verbose:
                print(f"\nPreparing email...")
                
            # Determine if it's with or without login for subject line
            check_type = "Public Access Link Check" if args.without_login else "Authenticated Link Check"
            
            # Add test mode indicator to subject if in test mode
            test_prefix = "[TEST] " if args.test else ""
            
            # Prepare email content
            subject = f"{test_prefix}MSD Multiuser Homepage {check_type} Report - {datetime.datetime.now().strftime('%Y-%m-%d')}"
            
            # Count results by status
            # NOTE(review): assumes all_results is non-empty here — an empty
            # run would make df_results['Status'] raise KeyError; confirm
            # upstream always yields at least one row.
            df_results = pd.DataFrame(all_results)
            status_counts = df_results['Status'].value_counts().to_dict()
            
            # Create email body
            body = f"""
MSD Multiuser Homepage Broken Link Checker Report
{f'[TEST MODE - LIMITED TO {args.test} URLs]' if args.test else ''}

Date: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Mode: {'Public Access (No Login)' if args.without_login else 'Authenticated Access'}
Homepage users checked: {len(user_pass_pairs)}
Additional URLs checked: {additional_urls_checked}
Retry settings: {args.max_retries} retries with {args.retry_timeout_multiplier}x timeout multiplier

Summary:
"""
            for status, count in status_counts.items():
                body += f"- {status}: {count}\n"
            
            if "Broken" in status_counts:
                body += f"\nWarning: {status_counts.get('Broken', 0)} broken links found!\n"
                
            body += "\nPlease see the attached Excel report for details."
            
            # Prepare attachment filename
            filename = None
            if args.output:
                filename = os.path.basename(args.output)
            else:
                test_indicator = "test_" if args.test else ""
                filename = f"msd_multiuser_link_check_{test_indicator}report_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
            
            # Send email
            if args.verbose:
                print(f"Sending email to {', '.join(email_recipients)}")
                
            email_success = send_email(
                smtp_server,
                smtp_port,
                email_sender,
                email_username,
                email_password,
                email_recipients,
                subject,
                body,
                report_data,
                filename
            )
            
            if email_success:
                logging.info("Email sent successfully with the report.")
                print("Email sent successfully with the report.")
            else:
                logging.error("Failed to send email with the report.")
                print("Failed to send email with the report.")
                
                # If email fails and we didn't save to file, let's save it somewhere so the results aren't lost
                if not args.output:
                    test_indicator = "test_" if args.test else ""
                    fallback_path = f"msd_multiuser_link_check_{test_indicator}report_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
                    if isinstance(report_data, BytesIO):
                        with open(fallback_path, 'wb') as f:
                            f.write(report_data.getvalue())
                        logging.info(f"Email failed, saved report to: {fallback_path}")
                        print(f"Email failed, saved report to: {fallback_path}")
        
        if args.verbose:
            print(f"\n==================================================")
            print(f"Process completed successfully!")
            print(f"==================================================\n")
            
        return 0
    except Exception as e:
        logging.error(f"Error in main function: {e}", exc_info=True)
        sys.exit(f"Error: {e}")

if __name__ == "__main__":
    # Propagate main()'s return value (0 on success) as the process exit code.
    sys.exit(main())