import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import os
import logging
import xlsxwriter
from tqdm import tqdm
import os
from datetime import datetime

# Load environment variables
load_dotenv()

OPENAI_KEY = os.getenv('OPENAI_KEY')
WEB_USERNAME = os.getenv('WEB_USERNAME')
WEB_PASSWORD = os.getenv('WEB_PASSWORD')

# Configuration variables
EXCEL_INPUT_PATH = os.getenv('EXCEL_INPUT_PATH')
EXCEL_OUTPUT_PATH = os.getenv('EXCEL_OUTPUT_PATH')
WAIT_TIME = 2  # seconds
ELEMENT_WAIT_TIME = 10  # seconds
LOGIN_URL = "https://profesionales.msd.es/login/"

# Setup logging
logging.basicConfig(
    filename='link_checker.log',
    filemode='a',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

# Initialize Selenium WebDriver
def initialize_driver():
    options = webdriver.ChromeOptions()
    # Uncomment the next line to run Chrome in headless mode
    # options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(options=options)
    logging.info("Initialized Selenium WebDriver.")
    return driver

def accept_cookies(driver):
    try:
        accept_button = WebDriverWait(driver, ELEMENT_WAIT_TIME).until(
            EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
        )
        accept_button.click()
        logging.info("Accepted cookies.")
        time.sleep(2)
    except Exception as e:
        logging.info("No cookies acceptance button found or already accepted.")

def login_to_portal(driver, username, password):
    try:
        driver.get(LOGIN_URL)
        logging.info("Navigated to login page.")
        time.sleep(2)
        accept_cookies(driver)
        
        # Enter username and click "Siguiente"
        username_field = WebDriverWait(driver, ELEMENT_WAIT_TIME).until(
            EC.presence_of_element_located((By.ID, "capture_signInFull_username"))
        )
        username_field.clear()
        username_field.send_keys(username)
        logging.info("Entered username.")
        
        next_button = WebDriverWait(driver, ELEMENT_WAIT_TIME).until(
            EC.element_to_be_clickable((By.ID, "buttonNext_signInFull"))
        )
        next_button.click()
        logging.info("Clicked 'Siguiente' button.")
        time.sleep(10)
        
        # Enter username again and password
        username_field_retry = WebDriverWait(driver, ELEMENT_WAIT_TIME).until(
            EC.presence_of_element_located((By.ID, "capture_signInFull_signInUsername"))
        )
        username_field_retry.clear()
        username_field_retry.send_keys(username)
        logging.info("Re-entered username.")
        
        password_field = WebDriverWait(driver, ELEMENT_WAIT_TIME).until(
            EC.presence_of_element_located((By.ID, "capture_signInFull_currentPassword"))
        )
        password_field.clear()
        password_field.send_keys(password)
        logging.info("Entered password.")
        
        submit_button = WebDriverWait(driver, ELEMENT_WAIT_TIME).until(
            EC.element_to_be_clickable((By.XPATH, "//button[@type='submit' and contains(., 'Acceda')]"))
        )
        submit_button.click()
        logging.info("Clicked 'Acceda' button.")
        time.sleep(5)
        
        # Check for login errors
        try:
            error_message = driver.find_element(By.CSS_SELECTOR, ".error-message-selector")  # Update with the actual selector for the error message
            if error_message:
                logging.error("Login failed: Incorrect username or password.")
                driver.quit()
                return False
        except:
            logging.info("Logged in successfully.")
            return True
    except Exception as e:
        logging.error("An error occurred during login:", exc_info=True)
        driver.quit()
        return False

def read_excel(file_path):
    try:
        df = pd.read_excel(file_path)
        logging.info(f"Excel file '{file_path}' read successfully.")
        return df
    except FileNotFoundError:
        logging.error(f"Excel file '{file_path}' not found.")
        raise
    except Exception as e:
        logging.error(f"Error reading Excel file '{file_path}':", exc_info=True)
        raise

def get_session_cookies(driver):
    selenium_cookies = driver.get_cookies()
    session_cookies = {}
    for cookie in selenium_cookies:
        session_cookies[cookie['name']] = cookie['value']
    logging.info("Extracted cookies from Selenium WebDriver.")
    return session_cookies

def check_link_status(url, session_cookies):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; LinkChecker/1.0)"
        }
        response = requests.head(url, allow_redirects=True, timeout=15, headers=headers, cookies=session_cookies)
        
        final_url = response.url
        if final_url.startswith(LOGIN_URL):
            logging.warning(f"Link redirects to login page: {url}")
            return "Requires Authentication"
        
        if response.status_code < 400:
            return "OK"
        else:
            logging.warning(f"Broken link detected: {url} with status {response.status_code}")
            return "Broken"
    except requests.RequestException as e:
        logging.error(f"Request exception for URL {url}: {e}")
        return "Broken"
    except Exception as e:
        logging.error(f"Unexpected error for URL {url}: {e}", exc_info=True)
        return "Broken"

def re_authenticate(driver, username, password):
    logging.info("Attempting to re-authenticate...")
    login_success = login_to_portal(driver, username, password)
    if login_success:
        logging.info("Re-authentication successful.")
        return True
    else:
        logging.error("Re-authentication failed.")
        return False

def extract_links(driver, parent_url):
    try:
        driver.get(parent_url)
        logging.info(f"Navigated to URL: {parent_url}")
        time.sleep(WAIT_TIME)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        main_content = soup.find(id="mhh_mcn_main")
        if not main_content:
            logging.warning(f"No main content found for URL: {parent_url}")
            return []
        
        links_set = set()  # Use a set to store unique links
        # Extract article and product links
        for a_tag in main_content.find_all('a', href=True):
            href = a_tag['href']
            links_set.add(('Article/Product', href))
        
        # Extract image links
        for img_tag in main_content.find_all('img', src=True):
            src = img_tag['src']
            links_set.add(('Image', src))
        
        # Convert the set back to a list of dictionaries
        unique_links = [{'url': url, 'type': link_type} for link_type, url in links_set]
        
        logging.info(f"Extracted {len(unique_links)} unique child links from {parent_url}.")
        return unique_links
    except Exception as e:
        logging.error(f"Error extracting links from {parent_url}:", exc_info=True)
        return []

def compile_results(df, driver, session_cookies, username, password):
    results = []
    reauth_attempted = False
    checked_links = set()

    if 'URL' not in df.columns:
        logging.error("Excel file does not contain 'URL' column.")
        raise KeyError("Excel file does not contain 'URL' column.")

    total_urls = len(df)
    
    # Create a progress bar
    progress_bar = tqdm(total=total_urls, desc="Processing URLs", unit="URL")

    for index, row in df.iterrows():
        parent_url = row['URL']
        logging.info(f"Processing URL: {parent_url}")
        child_links = extract_links(driver, parent_url)

        if not child_links:
            results.append({
                'Parent URL': parent_url,
                'Child URL': '',
                'Link Type': '',
                'Status': 'No Links Found',
                'Notes': 'Main content container not found.'
            })
            progress_bar.update(1)
            continue

        all_ok = True
        for link in child_links:
            link_url = link['url']
            link_type = link['type']
            
            if link_url in checked_links:
                logging.info(f"Link already checked: {link_url}. Skipping duplicate entry.")
                continue

            status = check_link_status(link_url, session_cookies)
            checked_links.add(link_url)

            if status == "Requires Authentication":
                if not reauth_attempted:
                    reauth_success = re_authenticate(driver, username, password)
                    if reauth_success:
                        session_cookies.clear()
                        updated_cookies = get_session_cookies(driver)
                        session_cookies.update(updated_cookies)
                        reauth_attempted = True
                        status = check_link_status(link_url, session_cookies)
                        if status == "Requires Authentication":
                            logging.error(f"Link still redirects to login after re-authentication: {link_url}")
                            all_ok = False
                    else:
                        logging.error("Re-authentication failed. Cannot proceed with link checking.")
                        status = "Requires Authentication"
                        all_ok = False
                else:
                    logging.error("Re-authentication already attempted. Skipping further re-authentication.")
                    all_ok = False
            elif status == "Broken":
                all_ok = False

            results.append({
                'Parent URL': parent_url,
                'Child URL': link_url,
                'Link Type': link_type,
                'Status': status,
                'Notes': ''
            })

        if all_ok:
            results.append({
                'Parent URL': parent_url,
                'Child URL': '',
                'Link Type': '',
                'Status': 'All Links OK',
                'Notes': 'Everything is working correctly.'
            })
        
        progress_bar.update(1)

    progress_bar.close()
    logging.info(f"Compiled results for {total_urls} URLs.")
    return results

def generate_report(results, output_path):
    try:
        # Convert results to DataFrame
        df_report = pd.DataFrame(results)
        
        # Sort by 'Parent URL' to keep related links together
        df_report.sort_values(by=['Parent URL', 'Child URL'], inplace=True)
        
        with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
            df_report.to_excel(writer, index=False, sheet_name='Link Check Report')
            
            workbook = writer.book
            worksheet = writer.sheets['Link Check Report']
            
            # Define formats
            header_format = workbook.add_format({
                'bold': True,
                'text_wrap': True,
                'valign': 'top',
                'fg_color': '#D7E4BC',
                'border': 1
            })
            
            # Apply header format and set column widths
            for col_num, value in enumerate(df_report.columns.values):
                worksheet.write(0, col_num, value, header_format)
                if value in ['Parent URL', 'Child URL']:
                    worksheet.set_column(col_num, col_num, 50)
                else:
                    worksheet.set_column(col_num, col_num, 20)
            
            # Add alternating row colors
            for row_num in range(1, len(df_report) + 1):
                if row_num % 2 == 0:
                    worksheet.set_row(row_num, None, workbook.add_format({'bg_color': '#F0F0F0'}))
            
            # Freeze the top row and apply filters
            worksheet.freeze_panes(1, 0)
            worksheet.autofilter(0, 0, len(df_report), len(df_report.columns) - 1)
        
        logging.info(f"Report generated at '{output_path}' with basic formatting.")
        print(f"Report generated at '{output_path}' with basic formatting.")
    except Exception as e:
        logging.error(f"Failed to generate report at '{output_path}':", exc_info=True)
        raise

def get_session_cookies(driver):
    selenium_cookies = driver.get_cookies()
    session_cookies = {}
    for cookie in selenium_cookies:
        session_cookies[cookie['name']] = cookie['value']
    logging.info("Extracted cookies from Selenium WebDriver.")
    return session_cookies

def main():
    driver = initialize_driver()
    try:
        login_success = login_to_portal(driver, WEB_USERNAME, WEB_PASSWORD)
        if not login_success:
            logging.error("Login failed. Exiting script.")
            return
        print('Logged in successfully.')
        
        # Initialize a requests session
        session = requests.Session()
        # Transfer Selenium cookies to requests session
        session_cookies = get_session_cookies(driver)
        session.cookies.update(session_cookies)
        logging.info("Updated requests session with Selenium cookies.")
        
        df_input = read_excel(EXCEL_INPUT_PATH)
        results = compile_results(df_input, driver, session_cookies, WEB_USERNAME, WEB_PASSWORD)
        
        # Generate timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Create output filename with timestamp
        output_filename = f"URLs_checked_{timestamp}.xlsx"
        output_path = os.path.join(os.path.dirname(EXCEL_OUTPUT_PATH), output_filename)
        
        generate_report(results, output_path)
        
    except KeyError as ke:
        logging.error(f"KeyError: {ke}")
        print(f"Error: {ke}")
    except Exception as e:
        logging.error("An unexpected error occurred in main:", exc_info=True)
        print("An unexpected error occurred. Check the log for details.")
    finally:
        driver.quit()
        logging.info("Closed Selenium WebDriver.")

if __name__ == "__main__":
    main()