#!/usr/bin/env python3
import os
import sys
import argparse
import pandas as pd
import time
import logging
import requests
from bs4 import BeautifulSoup
import datetime
import json
import tempfile
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
from pathlib import Path
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
import xlsxwriter
from io import BytesIO
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email.mime.text import MIMEText
from email.utils import formatdate
from email import encoders

# Setup logging
# The log file lives next to this script (not the CWD) so the daemon's output
# is easy to find regardless of where it was launched from. Append mode keeps
# history across restarts.
log_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'link_checker_daemon.log')
logging.basicConfig(
    filename=log_file,
    filemode='a',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)
logging.info("Starting app_daemon.py with log file: %s", log_file)

def initialize_playwright(headless=True):
    """Start the Playwright driver and launch a Chromium browser.

    Args:
        headless: Launch the browser without a visible window.

    Returns:
        A (playwright, browser) pair. The caller owns both and must call
        browser.close() and playwright.stop() when finished.
    """
    driver = sync_playwright().start()
    chromium = driver.chromium.launch(headless=headless)
    logging.info("Initialized Playwright and launched browser")
    return driver, chromium

def login_to_portal(username, password, login_url, headless=True):
    """Login to the portal and save storage state to a temp file for reuse.

    Drives a fresh Chromium session through the portal's two-step login form
    (the button labels are Spanish: 'Siguiente' = Next, 'Acceda' = Sign in),
    then persists the Playwright storage state (cookies etc.) to a temporary
    JSON file so other functions can reuse the authenticated session.

    Args:
        username: Portal account username.
        password: Portal account password.
        login_url: URL of the portal login page.
        headless: Whether to run the browser without a visible window.

    Returns:
        (True, storage_state_path) on success; (False, None) on any error.
    """
    playwright = None
    browser = None
    context = None
    page = None
    
    try:
        logging.info("Attempting login and saving session state...")
        playwright = sync_playwright().start()
        browser = playwright.chromium.launch(headless=headless)
        context = browser.new_context()
        page = context.new_page()
        
        logging.info("Navigating to login page...")
        page.goto(login_url)
        time.sleep(1)
        
        # Accept cookies if present (OneTrust consent banner). The banner not
        # appearing is expected and logged at INFO, not treated as an error.
        try:
            accept_button = page.locator("#onetrust-accept-btn-handler")
            if accept_button.is_visible(timeout=3000):
                accept_button.click()
                logging.info("Accepted cookies during login attempt")
                time.sleep(1)
        except Exception as e:
            logging.info(f"Cookie dialog not found or error during login: {e}")

        # Enter username (first step of the two-step login form)
        page.locator("#capture_signInFull_username").fill(username)
        logging.info("Entered username")
        
        # Click next
        page.locator("#buttonNext_signInFull").click()
        logging.info("Clicked 'Siguiente' button")
        
        # Re-enter username and password. The second form step presents a
        # different username field id — portal-specific behavior, so the
        # username is filled again if the field is visible.
        username_field_retry = page.locator("#capture_signInFull_signInUsername")
        username_field_retry.wait_for(state='visible', timeout=10000)
        if username_field_retry.is_visible():
            username_field_retry.fill(username)
            logging.info("Re-entered username")
        
        page.locator("#capture_signInFull_currentPassword").fill(password)
        logging.info("Entered password")
        
        # Submit
        page.get_by_role("button", name="Acceda").click()
        logging.info("Clicked 'Acceda' button")

        # Wait for login confirmation: success is detected by being redirected
        # to any URL that no longer contains "login".
        page.wait_for_url(lambda url: "login" not in url, timeout=25000)
        logging.info(f"Login successful, redirected to: {page.url}")
        
        # Save storage state to a temporary file. delete=False: the file must
        # outlive this function — the caller is responsible for its lifetime.
        storage_state = context.storage_state()
        
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
            json.dump(storage_state, temp_file)
            storage_state_path = temp_file.name
            logging.info(f"Saved storage state to temporary file: {storage_state_path}")
        
        return True, storage_state_path
    except Exception as e:
        logging.error(f"An error occurred during login: {e}", exc_info=True)
        return False, None
    finally:
        # Cleanup resources used for login (runs on both success and failure)
        if page: page.close()
        if context: context.close()
        if browser: browser.close()
        if playwright: playwright.stop()

def read_excel(file_path, test_limit=None):
    """Load the spreadsheet of URLs to check.

    Args:
        file_path: Path to an Excel file that must contain a 'URL' column.
        test_limit: When a positive number, only the first N rows are kept
            (used for quick test runs).

    Returns:
        A pandas DataFrame with the (possibly truncated) rows.

    Raises:
        KeyError: If the 'URL' column is missing.
        Any pandas/IO error encountered while reading the file (re-raised
        after logging).
    """
    try:
        frame = pd.read_excel(file_path)
        if 'URL' not in frame.columns:
            raise KeyError("Excel file does not contain 'URL' column")

        # Truncate for test runs when a positive limit was requested.
        limit_active = bool(test_limit) and test_limit > 0
        if limit_active:
            logging.info(f"TEST MODE: Limiting to first {test_limit} URLs")
            frame = frame.head(test_limit)

        logging.info(f"Excel file read successfully: {file_path}, processing {len(frame)} URLs")
        return frame
    except Exception as e:
        logging.error(f"Error reading Excel file: {e}", exc_info=True)
        raise

def check_link_status(url, session_cookies, login_url, progress_queue=None, retry_count=2, retry_timeout_multiplier=2, initial_timeout=15, verbose=False):
    """Check if a link is working using requests with retry logic for potential false negatives.

    Issues a HEAD request (following redirects) with the given session
    cookies. Failures are retried with an exponentially growing timeout to
    weed out transient errors.

    Args:
        url: The URL to check.
        session_cookies: Cookie dict forwarded to requests.
        login_url: Prefix identifying the portal login page; a redirect there
            means the session is no longer valid (not retried).
        progress_queue: Optional queue; one item is put per completed check.
        retry_count: Number of retries after the initial attempt.
        retry_timeout_multiplier: Factor applied to the timeout on each retry.
        initial_timeout: Timeout in seconds for the first attempt.
        verbose: Print retry progress to stdout.

    Returns:
        "OK", "Requires Authentication", or "Broken".
    """
    current_timeout = initial_timeout
    # BUGFIX: default result so the function can never raise UnboundLocalError
    # when every attempt falls through. Previously, with retry_count=0, a
    # failing first attempt took the "prepare for retry" branch, the loop
    # ended, and `result` was never assigned.
    result = "Broken"

    for attempt in range(retry_count + 1):  # +1 for the initial attempt
        is_last_attempt = attempt == retry_count
        try:
            is_retry = attempt > 0
            if is_retry and verbose:
                print(f"    Retry {attempt}/{retry_count} for {url} with timeout {current_timeout}s")

            headers = {
                "User-Agent": "Mozilla/5.0 (compatible; LinkChecker/1.0)"
            }
            response = requests.head(url, allow_redirects=True, timeout=current_timeout, headers=headers, cookies=session_cookies)

            final_url = response.url
            if final_url.startswith(login_url):
                # Redirect to the login page means the session expired;
                # retrying with a longer timeout would not help.
                logging.warning(f"Link redirects to login page: {url}")
                result = "Requires Authentication"
                break
            elif response.status_code < 400:
                result = "OK"
                break
            elif is_last_attempt:
                # BUGFIX: was `is_retry and attempt == retry_count`, which
                # never matched when retry_count == 0.
                logging.warning(f"Broken link confirmed after {retry_count} retries: {url} with status {response.status_code}")
                result = "Broken"
                break
            else:
                # Failed, but attempts remain: grow the timeout and retry.
                if is_retry:
                    logging.info(f"Link retry {attempt}/{retry_count} failed: {url}. Will retry with longer timeout.")
                else:
                    logging.warning(f"Potential broken link detected: {url} with status {response.status_code}. Will retry with longer timeout.")
                current_timeout = current_timeout * retry_timeout_multiplier
                continue

        except requests.RequestException as e:
            if is_last_attempt:
                logging.error(f"Request exception for URL {url} after {retry_count} retries: {e}")
                result = "Broken"
                break
            # Prepare for retry
            current_timeout = current_timeout * retry_timeout_multiplier
            logging.info(f"Request exception during attempt {attempt+1}/{retry_count+1} for URL {url}: {e}. Will retry.")
        except Exception as e:
            if is_last_attempt:
                logging.error(f"Unexpected error for URL {url} after {retry_count} retries: {e}", exc_info=True)
                result = "Broken"
                break
            # Prepare for retry
            current_timeout = current_timeout * retry_timeout_multiplier
            logging.info(f"Unexpected error during attempt {attempt+1}/{retry_count+1} for URL {url}: {e}. Will retry.")

    if progress_queue:
        progress_queue.put(1)
    return result

def check_link_status_without_login(url, login_url, retry_count=2, retry_timeout_multiplier=2, initial_timeout=15, verbose=False):
    """Check if a link is accessible without login, with retry logic.

    Issues an anonymous HEAD request (following redirects). Failures are
    retried with an exponentially growing timeout.

    Args:
        url: The URL to check.
        login_url: Prefix identifying the portal login page; a redirect there
            means the URL requires authentication.
        retry_count: Number of retries after the initial attempt.
        retry_timeout_multiplier: Factor applied to the timeout on each retry.
        initial_timeout: Timeout in seconds for the first attempt.
        verbose: Print retry progress to stdout.

    Returns:
        "Publicly Accessible", "Requires Authentication", or "Broken".
    """
    current_timeout = initial_timeout

    for attempt in range(retry_count + 1):  # +1 for the initial attempt
        is_last_attempt = attempt == retry_count
        try:
            is_retry = attempt > 0
            if is_retry and verbose:
                print(f"    Retry {attempt}/{retry_count} for {url} with timeout {current_timeout}s")

            headers = {
                "User-Agent": "Mozilla/5.0 (compatible; LinkChecker/1.0)"
            }
            response = requests.head(url, allow_redirects=True, timeout=current_timeout, headers=headers)

            final_url = response.url
            if final_url.startswith(login_url):
                # Redirect to the login page: authentication required.
                return "Requires Authentication"

            if response.status_code < 400:
                return "Publicly Accessible"
            elif is_last_attempt:
                # BUGFIX: was `is_retry and attempt == retry_count`, which
                # never matched when retry_count == 0 — the code then logged a
                # misleading "Will retry" and fell through to the default
                # return instead of reporting the confirmed failure here.
                logging.warning(f"Broken link confirmed after {retry_count} retries: {url} with status {response.status_code}")
                return "Broken"
            else:
                # Failed, but attempts remain: grow the timeout and retry.
                if is_retry:
                    logging.info(f"Link retry {attempt}/{retry_count} failed: {url}. Will retry with longer timeout.")
                else:
                    logging.warning(f"Potential broken link detected: {url} with status {response.status_code}. Will retry with longer timeout.")
                current_timeout = current_timeout * retry_timeout_multiplier
                continue

        except requests.RequestException as e:
            if is_last_attempt:
                logging.error(f"Request exception for URL {url} after {retry_count} retries: {e}")
                return "Broken"
            # Prepare for retry
            current_timeout = current_timeout * retry_timeout_multiplier
            logging.info(f"Request exception during attempt {attempt+1}/{retry_count+1} for URL {url}: {e}. Will retry.")
        except Exception as e:
            if is_last_attempt:
                logging.error(f"Unexpected error for URL {url} after {retry_count} retries: {e}", exc_info=True)
                return "Broken"
            # Prepare for retry
            current_timeout = current_timeout * retry_timeout_multiplier
            logging.info(f"Unexpected error during attempt {attempt+1}/{retry_count+1} for URL {url}: {e}. Will retry.")

    return "Broken"  # Defensive default; should be unreachable after the fix above

def extract_links(storage_state_path, parent_url, wait_time=4, verbose=False):
    """Extract links from a webpage using Playwright and BeautifulSoup.

    Opens *parent_url* in an authenticated browser context (restored from the
    storage-state JSON at *storage_state_path*), waits for network idle plus
    an extra settle period, then collects the href of every <a> tag and the
    src of every <img> tag inside the container with id="mhh_mcn_main".

    Args:
        storage_state_path: Path to a Playwright storage-state JSON file.
        parent_url: Page to open and scan for links.
        wait_time: Extra seconds to wait after the page reaches network idle
            (for late JS-rendered content).
        verbose: Print progress to stdout.

    Returns:
        List of {'url': ..., 'type': 'Article/Product' | 'Image'} dicts,
        de-duplicated. Empty list when the main container is missing or on
        any error.
    """
    playwright = None
    browser = None
    context = None
    page = None
    
    try:
        if verbose:
            print(f"  Opening page to extract links...")
            
        playwright = sync_playwright().start()
        browser = playwright.chromium.launch(headless=True)
        
        # Load storage state from the temporary file so the page is opened
        # with the authenticated session's cookies.
        with open(storage_state_path, 'r') as f:
            storage_state = json.load(f)
        
        context = browser.new_context(storage_state=storage_state)
        page = context.new_page()
        
        page.goto(parent_url, wait_until="networkidle", timeout=60000)
        logging.info(f"Navigated to URL: {parent_url}")
        page.wait_for_timeout(wait_time * 1000)  # Convert seconds to milliseconds
        
        if verbose:
            print(f"  Parsing HTML content...")
            
        html_content = page.content()
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Only links inside the main content container are checked;
        # navigation/footer links elsewhere on the page are ignored.
        main_content = soup.find(id="mhh_mcn_main")
        if not main_content:
            logging.warning(f"No main content found for URL: {parent_url}")
            if verbose:
                print(f"  No main content container found (id='mhh_mcn_main')")
            return []
        
        # Use a set of (type, url) pairs to de-duplicate before building dicts.
        links_set = set()
        for a_tag in main_content.find_all('a', href=True):
            href = a_tag['href']
            # Skip pure in-page anchors ('#...'); they are not separate resources.
            if not href.startswith('#'):
                links_set.add(('Article/Product', href))
        
        for img_tag in main_content.find_all('img', src=True):
            src = img_tag['src']
            links_set.add(('Image', src))
        
        unique_links = [{'url': url, 'type': link_type} for link_type, url in links_set]
        
        logging.info(f"Extracted {len(unique_links)} unique child links from {parent_url}.")
        if verbose:
            print(f"  Extracted {len(unique_links)} unique links")
        return unique_links
    except Exception as e:
        logging.error(f"Error extracting links from {parent_url}: {e}", exc_info=True)
        if verbose:
            print(f"  Error extracting links: {str(e)}")
        return []
    finally:
        # Always release browser resources, even on error.
        if page: page.close()
        if context: context.close()
        if browser: browser.close()
        if playwright: playwright.stop()

def compile_results(df, storage_state_path, username, password, login_url, wait_time, without_login, verbose=False, max_retries=2, retry_timeout_multiplier=2):
    """Check links and compile results.

    For each parent URL in *df*, either checks public accessibility
    (without_login=True) or extracts all child links from the authenticated
    page and checks each one concurrently, re-authenticating at most once per
    run if the session expires.

    Args:
        df: DataFrame with a 'URL' column of parent pages (assumes a default
            RangeIndex for the 1-based progress display).
        storage_state_path: Playwright storage-state JSON path (ignored when
            without_login is True).
        username, password, login_url: Credentials for re-authentication.
        wait_time: Seconds to wait after page load before extracting links.
        without_login: Only check whether parent URLs are publicly reachable.
        verbose: Print progress to stdout.
        max_retries, retry_timeout_multiplier: Retry policy forwarded to the
            link checkers.

    Returns:
        List of dicts with keys 'Parent URL', 'Child URL', 'Link Type',
        'Status', and 'Notes'.
    """
    results = []
    reauth_attempted = False   # re-authenticate at most once per run
    link_status_dict = {}      # cache: child URL -> status, shared across parents
    progress_queue = Queue()
    session_cookies = {}

    if not without_login:
        # Get session cookies from storage state
        with open(storage_state_path, 'r') as f:
            storage_state = json.load(f)

        # Convert Playwright cookies to requests cookies format
        for cookie in storage_state.get('cookies', []):
            session_cookies[cookie['name']] = cookie['value']

    total_urls = len(df)
    logging.info(f"Processing {total_urls} URLs")
    if verbose:
        print(f"\n=== Processing {total_urls} URLs ===")
        if max_retries > 0:
            print(f"=== Using {max_retries} retries with timeout multiplier {retry_timeout_multiplier}x for potential false negatives ===")

    for index, row in df.iterrows():
        parent_url = row['URL']
        logging.info(f"Processing URL {index+1}/{total_urls}: {parent_url}")
        if verbose:
            print(f"\n[{index+1}/{total_urls}] Processing: {parent_url}")

        if without_login:
            # Anonymous mode: only check the parent URL itself.
            if verbose:
                print(f"  Checking without login...")
            status = check_link_status_without_login(
                parent_url, 
                login_url, 
                retry_count=max_retries, 
                retry_timeout_multiplier=retry_timeout_multiplier,
                verbose=verbose
            )
            note = ""
            if status == "Publicly Accessible":
                note = "URL is accessible without authentication."
            elif status == "Requires Authentication":
                note = "URL correctly requires authentication."

            results.append({
                'Parent URL': parent_url,
                'Child URL': '',
                'Link Type': '',
                'Status': status,
                'Notes': note
            })

            if verbose:
                print(f"  Result: {status}")
        else:
            if verbose:
                print(f"  Extracting links...")
            child_links = extract_links(storage_state_path, parent_url, wait_time, verbose)

            if not child_links:
                if verbose:
                    print(f"  No links found (main content container not found)")
                results.append({
                    'Parent URL': parent_url,
                    'Child URL': '',
                    'Link Type': '',
                    'Status': 'No Links Found',
                    'Notes': 'Main content container not found.'
                })
                continue

            if verbose:
                print(f"  Found {len(child_links)} links to check")

            all_ok = True  # Initialize as True for each parent URL
            total_links = len(child_links)
            processed_links = 0

            # Create a thread pool with 5 workers
            with ThreadPoolExecutor(max_workers=5) as executor:
                future_to_url = {}

                # Submit all tasks to the executor
                for link in child_links:
                    link_url = link['url']
                    link_type = link['type']

                    if link_url in link_status_dict:  # Check if link has been processed
                        processed_links += 1
                        status = link_status_dict[link_url]
                        # BUGFIX: a cached non-OK status must also mark this
                        # parent as failing; previously cached "Broken" links
                        # left all_ok True and the parent reported
                        # "All Links OK".
                        if status != "OK":
                            all_ok = False
                        results.append({
                            'Parent URL': parent_url,
                            'Child URL': link_url,
                            'Link Type': link_type,
                            'Status': status,
                            'Notes': ''
                        })
                        if verbose and processed_links % 5 == 0:
                            print(f"  Progress: {processed_links}/{total_links} links (using cached results)")
                        continue

                    future = executor.submit(
                        check_link_status, 
                        link_url, 
                        session_cookies, 
                        login_url, 
                        progress_queue,
                        max_retries,
                        retry_timeout_multiplier,
                        15,  # Initial timeout
                        verbose
                    )
                    future_to_url[future] = (link_url, link_type)

                # Process completed futures as they finish
                for future in as_completed(future_to_url):
                    link_url, link_type = future_to_url[future]
                    try:
                        status = future.result()

                        if status == "Requires Authentication" and not reauth_attempted:
                            # Try to re-authenticate and get new cookies
                            if verbose:
                                print(f"  Session expired. Re-authenticating...")
                            logging.info("Re-authenticating due to session expiration")
                            login_success, new_storage_state_path = login_to_portal(username, password, login_url)

                            if login_success:
                                storage_state_path = new_storage_state_path
                                # Update session cookies. Mutating the shared
                                # dict in place means in-flight workers see
                                # the fresh cookies too.
                                with open(storage_state_path, 'r') as f:
                                    storage_state = json.load(f)

                                session_cookies.clear()
                                for cookie in storage_state.get('cookies', []):
                                    session_cookies[cookie['name']] = cookie['value']

                                reauth_attempted = True
                                # Retry with new cookies
                                if verbose:
                                    print(f"  Re-authentication successful. Retrying...")
                                status = check_link_status(
                                    link_url, 
                                    session_cookies, 
                                    login_url,
                                    None,
                                    max_retries,
                                    retry_timeout_multiplier,
                                    15,  # Initial timeout
                                    verbose
                                )

                                if status == "Requires Authentication":
                                    logging.error(f"Link still redirects to login after re-authentication: {link_url}")
                                    if verbose:
                                        print(f"  Link still requires authentication after re-login: {link_url}")
                            else:
                                logging.error("Re-authentication failed. Cannot proceed with link checking.")
                                if verbose:
                                    print(f"  Re-authentication failed!")
                                status = "Requires Authentication"

                        # BUGFIX: any non-OK final status marks the parent as
                        # failing. Previously only "Broken" did; a link that
                        # still required authentication (after re-auth had
                        # already been attempted, or after a re-auth retry
                        # that came back "Broken") left all_ok True.
                        if status != "OK":
                            all_ok = False

                        link_status_dict[link_url] = status
                        results.append({
                            'Parent URL': parent_url,
                            'Child URL': link_url,
                            'Link Type': link_type,
                            'Status': status,
                            'Notes': ''
                        })

                        processed_links += 1
                        logging.info(f"Processed {processed_links}/{total_links} links for {parent_url}")

                        if verbose:
                            if processed_links % 5 == 0 or processed_links == total_links:
                                print(f"  Progress: {processed_links}/{total_links} links checked")
                            if status != "OK" and status != "Publicly Accessible":
                                print(f"    - {status}: {link_url}")

                    except Exception as e:
                        logging.error(f"Error processing {link_url}: {str(e)}")
                        link_status_dict[link_url] = "Error"
                        results.append({
                            'Parent URL': parent_url,
                            'Child URL': link_url,
                            'Link Type': link_type,
                            'Status': "Error",
                            'Notes': str(e)
                        })
                        all_ok = False

                        processed_links += 1
                        if verbose:
                            print(f"  Error checking: {link_url}")
                            print(f"    - Error: {str(e)}")

            # Summary row for this parent URL.
            if all_ok:
                if verbose:
                    print(f"  Summary: All Links OK")
                results.append({
                    'Parent URL': parent_url,
                    'Child URL': '',
                    'Link Type': '',
                    'Status': 'All Links OK',
                    'Notes': 'Everything is working correctly.'
                })
            else:
                if verbose:
                    print(f"  Summary: Some Links Broken")
                results.append({
                    'Parent URL': parent_url,
                    'Child URL': '',
                    'Link Type': '',
                    'Status': 'Some Links Broken',
                    'Notes': 'One or more child links are broken.'
                })

        logging.info(f"Completed processing URL {index+1}/{total_urls}: {parent_url}")
        if verbose:
            print(f"Completed: {parent_url}")
            # Add a visual separator between URLs
            print("-" * 80)

    logging.info(f"Compiled results for {total_urls} URLs.")
    if verbose:
        print(f"\n=== Completed processing all {total_urls} URLs ===\n")
    return results

def _format_report_sheet(writer, df_report):
    """Apply shared workbook formatting to the 'Link Check Report' sheet.

    Styles the header row, sets column widths (wide for URL columns),
    zebra-stripes every other data row, freezes the header, and adds an
    autofilter. Previously this ~30-line block was duplicated verbatim in
    both branches of generate_report.
    """
    workbook = writer.book
    worksheet = writer.sheets['Link Check Report']

    header_format = workbook.add_format({
        'bold': True,
        'text_wrap': True,
        'valign': 'top',
        'fg_color': '#D7E4BC',
        'border': 1
    })

    # Re-write the header row with the styled format and size the columns.
    for col_num, value in enumerate(df_report.columns.values):
        worksheet.write(0, col_num, value, header_format)
        if value in ['Parent URL', 'Child URL']:
            worksheet.set_column(col_num, col_num, 50)
        else:
            worksheet.set_column(col_num, col_num, 20)

    # Zebra-stripe every other data row for readability.
    for row_num in range(1, len(df_report) + 1):
        if row_num % 2 == 0:
            worksheet.set_row(row_num, None, workbook.add_format({'bg_color': '#F0F0F0'}))

    worksheet.freeze_panes(1, 0)
    worksheet.autofilter(0, 0, len(df_report), len(df_report.columns) - 1)


def generate_report(results, output_path=None):
    """Generate an Excel report from the results.

    Args:
        results: List of result dicts (as produced by compile_results).
        output_path: When given, the report is written to this file and the
            path is returned. Otherwise the workbook is built in memory and a
            BytesIO buffer (seeked to the start) is returned.

    Raises:
        Re-raises any exception from pandas/xlsxwriter after logging it.
    """
    try:
        df_report = pd.DataFrame(results)
        df_report.sort_values(by=['Parent URL', 'Child URL'], inplace=True)

        if output_path:
            # Write to file
            with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
                df_report.to_excel(writer, index=False, sheet_name='Link Check Report')
                _format_report_sheet(writer, df_report)

            logging.info(f"Report generated: {output_path}")
            return output_path
        else:
            # Return as buffer (for in-memory processing)
            output = BytesIO()
            with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
                df_report.to_excel(writer, index=False, sheet_name='Link Check Report')
                _format_report_sheet(writer, df_report)

            output.seek(0)
            logging.info("Report generated in memory with basic formatting.")
            return output
    except Exception as e:
        logging.error(f"Failed to generate report: {e}", exc_info=True)
        raise

def send_email(smtp_server, smtp_port, sender_email, username, password, recipient_emails, subject, body, attachment_data=None, attachment_filename=None):
    """Send email with optional attachment (file path or in-memory data).

    Args:
        smtp_server: SMTP host to connect to.
        smtp_port: SMTP port.
        sender_email: From address (also used as login when username is empty).
        username: SMTP auth username (falls back to sender_email).
        password: SMTP auth password; when falsy, no STARTTLS/login is attempted.
        recipient_emails: List of recipient addresses.
        subject: Message subject.
        body: Plain-text message body.
        attachment_data: Either a BytesIO with the attachment contents, or a
            path (str) to an existing file. Any other truthy value aborts.
        attachment_filename: Explicit name for the attachment; defaults to
            the file's basename or a timestamped report name.

    Returns:
        True on success, False on any failure (errors are logged, not raised).
    """
    try:
        # Create a multipart message
        msg = MIMEMultipart()
        msg['From'] = sender_email
        msg['To'] = ', '.join(recipient_emails)
        msg['Date'] = formatdate(localtime=True)
        msg['Subject'] = subject

        # Add body to email
        msg.attach(MIMEText(body, 'plain'))

        # Add attachment if provided
        if attachment_data:
            part = MIMEBase('application', 'octet-stream')

            if isinstance(attachment_data, BytesIO):
                # Handle in-memory attachment
                part.set_payload(attachment_data.getvalue())
            elif isinstance(attachment_data, str) and os.path.exists(attachment_data):
                # Handle file path attachment
                with open(attachment_data, "rb") as attachment:
                    part.set_payload(attachment.read())
            else:
                logging.error("Invalid attachment data format")
                return False

            encoders.encode_base64(part)

            # Set filename for attachment
            if attachment_filename:
                filename = attachment_filename
            elif isinstance(attachment_data, str):
                filename = os.path.basename(attachment_data)
            else:
                filename = f"report_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"

            # BUGFIX: the computed filename was never used — the header was
            # hardcoded to filename="(unknown)", so every attachment arrived
            # with that name.
            part.add_header('Content-Disposition', f'attachment; filename="{filename}"')
            msg.attach(part)

        # Connect to server and send email
        with smtplib.SMTP(smtp_server, smtp_port) as server:
            if password:
                server.starttls()  # Enable security
                server.login(username or sender_email, password)
            server.sendmail(sender_email, recipient_emails, msg.as_string())

        logging.info(f"Email sent successfully to {', '.join(recipient_emails)}")
        return True
    except Exception as e:
        logging.error(f"Failed to send email: {e}", exc_info=True)
        return False

def main():
    """Command-line entry point for the MSD broken link checker.

    Parses CLI arguments, reads the input Excel file, optionally logs in to
    the MSD portal, checks all URLs (with retries), generates an Excel report
    (to disk when --output is given, otherwise in memory) and optionally
    emails it.

    Returns:
        int: 0 on success. Fatal errors terminate the process via sys.exit().
    """
    parser = argparse.ArgumentParser(description='MSD Broken Link Checker')
    parser.add_argument('--excel', required=True, help='Path to Excel file with URLs')
    parser.add_argument('--output', required=False, help='Path to save the output report (optional, for testing)')
    parser.add_argument('--without-login', action='store_true', help='Check links without login')
    parser.add_argument('--wait-time', type=int, default=4, help='Wait time in seconds after page loads')
    parser.add_argument('--username', help='Username for login (overrides .env file)')
    parser.add_argument('--password', help='Password for login (overrides .env file)')
    parser.add_argument('--test', type=int, help='Test mode: specify number of URLs to process (default: process all)')
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output showing progress')
    parser.add_argument('--max-retries', type=int, default=2, help='Maximum number of retries for potential false negatives (default: 2)')
    parser.add_argument('--retry-timeout-multiplier', type=float, default=2.0, help='Multiplier for timeout on each retry (default: 2.0)')

    # Email arguments
    parser.add_argument('--send-email', action='store_true', help='Send report via email')
    parser.add_argument('--smtp-server', help='SMTP server address (overrides .env file)')
    parser.add_argument('--smtp-port', type=int, help='SMTP server port (overrides .env file)')
    parser.add_argument('--email-sender', help='Sender email address (overrides .env file)')
    parser.add_argument('--email-username', help='Username for SMTP authentication (overrides .env file)')
    parser.add_argument('--email-password', help='Sender email password (overrides .env file)')
    parser.add_argument('--email-recipients', help='Comma-separated list of recipient email addresses (overrides .env file)')

    args = parser.parse_args()

    # Load environment variables from a .env file, if present
    load_dotenv()

    # Determine login credentials (CLI arguments override .env values)
    username = args.username or os.environ.get('MSD_USERNAME')
    password = args.password or os.environ.get('MSD_PASSWORD')
    login_url = "https://profesionales.msd.es/login/"

    # Email is sent when explicitly requested, or by default whenever no
    # --output path was given (daemon mode: the in-memory report is only
    # ever delivered by mail).
    send_email_flag = args.send_email or not args.output

    if send_email_flag:
        smtp_server = args.smtp_server or os.environ.get('SMTP_SERVER')
        smtp_port = args.smtp_port or int(os.environ.get('SMTP_PORT', 587))
        email_sender = args.email_sender or os.environ.get('EMAIL_SENDER')
        email_username = args.email_username or os.environ.get('EMAIL_USERNAME', email_sender)
        email_password = args.email_password or os.environ.get('EMAIL_PASSWORD')

        recipients_str = args.email_recipients or os.environ.get('EMAIL_RECIPIENTS', '')
        email_recipients = [r.strip() for r in recipients_str.split(',') if r.strip()]

        # Validate email settings before doing any expensive work
        if not all([smtp_server, email_sender, email_recipients]):
            logging.error("SMTP server, sender email, and recipient emails are required to send email")
            sys.exit("Error: Email settings are incomplete. Please provide all required email parameters in .env file or as arguments.")

    if not args.without_login and (not username or not password):
        logging.error("Username and password are required for login checks")
        sys.exit("Error: Username and password are required for login checks. "
                 "Set them in .env file or provide as arguments.")

    storage_state_path = None
    try:
        # Show test mode notice if enabled
        if args.test:
            if args.verbose:
                print(f"\n==================================================")
                print(f"TEST MODE ENABLED: Processing only the first {args.test} URLs")
                print(f"==================================================\n")
            else:
                print(f"TEST MODE ENABLED: Processing only the first {args.test} URLs")
            logging.info(f"TEST MODE ENABLED: Processing only the first {args.test} URLs")

        # Read Excel file (with test limit if specified)
        if args.verbose:
            print(f"Reading Excel file: {args.excel}")
        df_input = read_excel(args.excel, args.test)

        # Create output directory if output path is provided
        if args.output:
            output_dir = os.path.dirname(os.path.abspath(args.output))
            os.makedirs(output_dir, exist_ok=True)
            if args.verbose:
                print(f"Output directory created/verified: {output_dir}")

        if not args.without_login:
            # Login once and reuse the saved session state for all checks
            if args.verbose:
                print(f"\nLogging in to {login_url}...")
            login_success, storage_state_path = login_to_portal(username, password, login_url)
            if not login_success:
                logging.error("Login failed. Cannot proceed with authenticated checks.")
                sys.exit("Error: Login failed. Check credentials and try again.")
            elif args.verbose:
                print(f"Login successful. Session state saved.")

        # Compile results with retry settings
        results = compile_results(
            df_input,
            storage_state_path,
            username,
            password,
            login_url,
            args.wait_time,
            args.without_login,
            args.verbose,
            args.max_retries,
            args.retry_timeout_multiplier
        )

        # Generate report (in memory or file based on args)
        if args.verbose:
            print(f"\nGenerating report...")

        if args.output:
            report_path = generate_report(results, args.output)
            logging.info(f"Report saved to: {report_path}")
            print(f"Report saved to: {report_path}")
            report_data = report_path  # Use file path for email attachment
        else:
            # Generate in-memory report
            report_data = generate_report(results)
            logging.info("Report generated in memory")
            if args.verbose:
                print(f"Report generated in memory (not saved to disk)")

        # Send email
        if send_email_flag:
            if args.verbose:
                print(f"\nPreparing email...")

            # Determine if it's with or without login for subject line
            check_type = "Public Access Link Check" if args.without_login else "Authenticated Link Check"

            # Add test mode indicator to subject if in test mode
            test_prefix = "[TEST] " if args.test else ""

            # Prepare email content
            subject = f"{test_prefix}MSD {check_type} Report - {datetime.datetime.now().strftime('%Y-%m-%d')}"

            # Count results by status. Guard against an empty result set:
            # a DataFrame built from [] has no 'Status' column and would
            # raise KeyError here (was an unguarded lookup before).
            df_results = pd.DataFrame(results)
            status_counts = (
                df_results['Status'].value_counts().to_dict()
                if 'Status' in df_results.columns else {}
            )

            # Create email body
            body = f"""
MSD Broken Link Checker Report
{f'[TEST MODE - LIMITED TO {args.test} URLs]' if args.test else ''}

Date: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Mode: {'Public Access (No Login)' if args.without_login else 'Authenticated Access'}
URLs checked: {len(df_input)}
Retry settings: {args.max_retries} retries with {args.retry_timeout_multiplier}x timeout multiplier

Summary:
"""
            for status, count in status_counts.items():
                body += f"- {status}: {count}\n"

            if "Broken" in status_counts:
                body += f"\nWarning: {status_counts.get('Broken', 0)} broken links found!\n"

            body += "\nPlease see the attached Excel report for details."

            # Prepare attachment filename
            filename = None
            if args.output:
                filename = os.path.basename(args.output)
            else:
                test_indicator = "test_" if args.test else ""
                filename = f"msd_link_check_{test_indicator}report_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"

            # Send email
            if args.verbose:
                print(f"Sending email to {', '.join(email_recipients)}")

            email_success = send_email(
                smtp_server,
                smtp_port,
                email_sender,
                email_username,
                email_password,
                email_recipients,
                subject,
                body,
                report_data,
                filename
            )

            if email_success:
                logging.info("Email sent successfully with the report.")
                print("Email sent successfully with the report.")
            else:
                logging.error("Failed to send email with the report.")
                print("Failed to send email with the report.")

                # If email fails and we didn't save to file, let's save it somewhere so the results aren't lost
                if not args.output:
                    test_indicator = "test_" if args.test else ""
                    fallback_path = f"msd_link_check_{test_indicator}report_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
                    if isinstance(report_data, BytesIO):
                        with open(fallback_path, 'wb') as f:
                            f.write(report_data.getvalue())
                        logging.info(f"Email failed, saved report to: {fallback_path}")
                        print(f"Email failed, saved report to: {fallback_path}")

        if args.verbose:
            print(f"\n==================================================")
            print(f"Process completed successfully!")
            print(f"==================================================\n")

        return 0
    except Exception as e:
        logging.error(f"Error in main function: {e}", exc_info=True)
        sys.exit(f"Error: {e}")
    finally:
        # BUG FIX: the temporary Playwright session file was previously only
        # removed on the success path, so any exception after login leaked
        # it. Cleaning up in `finally` covers both success and failure
        # (including the sys.exit/SystemExit paths, which bypass `except`).
        if storage_state_path and os.path.exists(storage_state_path):
            os.unlink(storage_state_path)
            logging.info(f"Cleaned up temporary storage state file: {storage_state_path}")
            if args.verbose:
                print(f"Cleaned up temporary session state file")

# Run the checker when executed as a script, propagating main()'s return
# value as the process exit status (SystemExit(v) is what sys.exit raises).
if __name__ == "__main__":
    raise SystemExit(main())
