import logging
import os
import time
from datetime import datetime
from io import BytesIO

import pandas as pd
import requests
import streamlit as st
import xlsxwriter
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Application-wide logging: append each run's history to a local file with
# timestamped, level-tagged entries.
logging.basicConfig(
    level=logging.INFO,
    filename='link_checker_streamlit.log',
    filemode='a',
    format='%(asctime)s - %(levelname)s - %(message)s',
)

def initialize_driver(wait_time, element_wait_time):
    """Create a headless Chrome WebDriver configured for link checking.

    Args:
        wait_time: Implicit wait in seconds applied to every element lookup.
        element_wait_time: Unused here (explicit waits are built per-call with
            WebDriverWait); kept for interface compatibility with callers.

    Returns:
        A ready-to-use ``selenium.webdriver.Chrome`` instance.
    """
    options = webdriver.ChromeOptions()
    # The original comment claimed headless mode was opt-in, but the flag was
    # already active — the driver always runs headless.
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')   # commonly paired with headless mode
    options.add_argument('--no-sandbox')    # needed in some containerized hosts
    driver = webdriver.Chrome(options=options)
    driver.implicitly_wait(wait_time)
    logging.info("Initialized Selenium WebDriver with wait times.")
    return driver

def accept_cookies(driver, element_wait_time):
    """Dismiss the OneTrust cookie-consent banner if it appears (best effort)."""
    try:
        banner_button = WebDriverWait(driver, element_wait_time).until(
            EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
        )
        banner_button.click()
        logging.info("Accepted cookies.")
        # Pause briefly so the overlay can disappear before interacting further.
        time.sleep(2)
    except Exception:
        # Banner absent, already dismissed, or click failed — not an error.
        logging.info("No cookies acceptance button found or already accepted.")

def login_to_portal(driver, username, password, login_url, element_wait_time):
    """Authenticate against the portal's two-step (username, then password) form.

    Args:
        driver: Initialized Selenium WebDriver.
        username: Portal account username.
        password: Portal account password.
        login_url: Absolute URL of the login page.
        element_wait_time: Explicit-wait timeout in seconds for each element.

    Returns:
        True if login appears successful, False otherwise. On an unexpected
        error the driver is quit before returning False.
    """
    try:
        driver.get(login_url)
        logging.info("Navigated to login page.")
        time.sleep(2)
        accept_cookies(driver, element_wait_time)

        # Step 1: enter the username and advance to the password screen.
        username_field = WebDriverWait(driver, element_wait_time).until(
            EC.presence_of_element_located((By.ID, "capture_signInFull_username"))
        )
        username_field.clear()
        username_field.send_keys(username)
        logging.info("Entered username.")

        next_button = WebDriverWait(driver, element_wait_time).until(
            EC.element_to_be_clickable((By.ID, "buttonNext_signInFull"))
        )
        next_button.click()
        logging.info("Clicked 'Siguiente' button.")
        time.sleep(10)  # wait for the portal to transition to the password form

        # Step 2: the password screen asks for the username again.
        username_field_retry = WebDriverWait(driver, element_wait_time).until(
            EC.presence_of_element_located((By.ID, "capture_signInFull_signInUsername"))
        )
        username_field_retry.clear()
        username_field_retry.send_keys(username)
        logging.info("Re-entered username.")

        password_field = WebDriverWait(driver, element_wait_time).until(
            EC.presence_of_element_located((By.ID, "capture_signInFull_currentPassword"))
        )
        password_field.clear()
        password_field.send_keys(password)
        logging.info("Entered password.")

        submit_button = WebDriverWait(driver, element_wait_time).until(
            EC.element_to_be_clickable((By.XPATH, "//button[@type='submit' and contains(., 'Acceda')]"))
        )
        submit_button.click()
        logging.info("Clicked 'Acceda' button.")
        time.sleep(5)  # allow the post-login redirect to complete

        # Probe for an on-page error banner. find_element raises
        # NoSuchElementException when the element is absent, which we treat as
        # success. (Previously a bare `except:` swallowed every exception —
        # including KeyboardInterrupt — and reported success.)
        try:
            driver.find_element(By.CSS_SELECTOR, ".error-message-selector")  # Update with the actual selector for the error message
        except NoSuchElementException:
            logging.info("Logged in successfully.")
            return True
        logging.error("Login failed: Incorrect username or password.")
        return False
    except Exception:
        logging.error("An error occurred during login.", exc_info=True)
        driver.quit()
        return False

def read_excel(file):
    """Load the uploaded spreadsheet into a DataFrame, surfacing errors in the UI."""
    try:
        frame = pd.read_excel(file)
    except FileNotFoundError:
        logging.error("Excel file not found.")
        st.error("Excel file not found.")
        raise
    except Exception:
        logging.error("Error reading Excel file.", exc_info=True)
        st.error("Error reading Excel file.")
        raise
    logging.info("Excel file read successfully.")
    return frame

def get_session_cookies(driver):
    """Flatten the driver's cookie jar into a name -> value dict for `requests`."""
    cookie_jar = {c['name']: c['value'] for c in driver.get_cookies()}
    logging.info("Extracted cookies from Selenium WebDriver.")
    return cookie_jar

def check_link_status(url, session_cookies, login_url):
    """Check a URL using the authenticated session cookies.

    Args:
        url: Absolute URL to test.
        session_cookies: Cookie dict extracted from the logged-in WebDriver.
        login_url: Login-page URL; redirects landing there mean auth expired.

    Returns:
        "OK", "Requires Authentication" (redirected to the login page),
        or "Broken".
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; LinkChecker/1.0)"
        }
        response = requests.head(url, allow_redirects=True, timeout=15, headers=headers, cookies=session_cookies)

        # Many servers reject or mishandle HEAD (405/403); confirm with a
        # streamed GET before declaring the link broken.
        if response.status_code >= 400:
            response = requests.get(url, allow_redirects=True, timeout=15, headers=headers, cookies=session_cookies, stream=True)
            response.close()  # we only need status/URL, not the body

        final_url = response.url
        if final_url.startswith(login_url):
            logging.warning(f"Link redirects to login page: {url}")
            return "Requires Authentication"

        if response.status_code < 400:
            return "OK"
        logging.warning(f"Broken link detected: {url} with status {response.status_code}")
        return "Broken"
    except requests.RequestException as e:
        logging.error(f"Request exception for URL {url}: {e}")
        return "Broken"
    except Exception as e:
        logging.error(f"Unexpected error for URL {url}: {e}", exc_info=True)
        return "Broken"

def check_link_status_without_login(url, login_url):
    """Check a URL anonymously to see whether it is publicly reachable.

    Args:
        url: Absolute URL to test (no cookies sent).
        login_url: Login-page URL; a redirect there means auth is enforced.

    Returns:
        "Publicly Accessible", "Requires Authentication", or "Broken".
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; LinkChecker/1.0)"
        }
        response = requests.head(url, allow_redirects=True, timeout=15, headers=headers)

        # HEAD is often unsupported (405/403); confirm with a streamed GET
        # before declaring the link broken.
        if response.status_code >= 400:
            response = requests.get(url, allow_redirects=True, timeout=15, headers=headers, stream=True)
            response.close()  # status/URL is all we need

        final_url = response.url
        if final_url.startswith(login_url):
            return "Requires Authentication"

        if response.status_code < 400:
            return "Publicly Accessible"
        return "Broken"
    except requests.RequestException as e:
        logging.error(f"Request exception for URL {url}: {e}")
        return "Broken"
    except Exception as e:
        logging.error(f"Unexpected error for URL {url}: {e}", exc_info=True)
        return "Broken"


def extract_links(driver, parent_url, wait_time, element_wait_time, max_retries=3, retry_delay=5):
    """Navigate to *parent_url* and collect unique child links from its main content.

    Navigation is retried up to ``max_retries`` times when Chrome drops its
    DevTools connection; as a last resort the WebDriver is re-initialized.

    Args:
        driver: Selenium WebDriver used for navigation.
        parent_url: Page whose child links should be extracted.
        wait_time: Seconds to sleep after navigation before parsing the DOM.
        element_wait_time: Forwarded to initialize_driver on re-initialization.
        max_retries: Maximum navigation attempts before giving up.
        retry_delay: Seconds to wait between retry attempts.

    Returns:
        List of ``{'url': ..., 'type': ...}`` dicts — type 'Article/Product'
        for anchors, 'Image' for images. Empty list on any failure.
    """
    # NOTE: these shadow the module-level imports of the same names; kept so
    # the function is self-contained.
    import time
    from selenium.common.exceptions import WebDriverException
    from bs4 import BeautifulSoup

    for attempt in range(1, max_retries + 1):
        try:
            driver.get(parent_url)
            logging.info(f"Navigated to URL: {parent_url}")
            break
        except WebDriverException as e:
            # Chrome occasionally loses the DevTools connection mid-session;
            # only that specific failure is retried.
            if "disconnected: not connected to DevTools" in str(e):
                logging.error(f"WebDriverException on attempt {attempt} for URL {parent_url}: {e}")
                if attempt < max_retries:
                    logging.info(f"Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                else:
                    logging.error(f"Max retries reached for URL {parent_url}. Attempting to re-initialize driver.")
                    try:
                        driver.quit()
                    except Exception:
                        pass
                    # NOTE(review): rebinding `driver` here only affects this
                    # function's local name — the caller keeps its reference to
                    # the old, quit driver. Confirm whether that is intended.
                    driver = initialize_driver(wait_time, element_wait_time)
                    try:
                        driver.get(parent_url)
                        logging.info(f"Navigated to URL after re-initializing driver: {parent_url}")
                        break
                    except WebDriverException as e:
                        logging.error(f"Failed to navigate after re-initializing driver for URL {parent_url}: {e}")
                        return []
            else:
                logging.error(f"Unexpected WebDriverException for URL {parent_url}: {e}", exc_info=True)
                return []
        except Exception as e:
            logging.error(f"Error extracting links from {parent_url}.", exc_info=True)
            return []

    else:
        # for/else: runs only if the loop exhausts without `break`. Every path
        # above either breaks or returns, so this is a defensive safety net.
        logging.error(f"Failed to navigate to {parent_url} after {max_retries} attempts.")
        return []

    try:
        time.sleep(wait_time)  # let client-side rendering settle before parsing
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Only links inside the portal's main content container are checked.
        main_content = soup.find(id="mhh_mcn_main")
        if not main_content:
            logging.warning(f"No main content found for URL: {parent_url}")
            return []

        links_set = set()
        # Anchors: skip in-page fragment links (href starting with '#').
        for a_tag in main_content.find_all('a', href=True):
            href = a_tag['href']
            if not href.startswith('#'):
                links_set.add(('Article/Product', href))

        # Images are checked too, as their src may also break.
        for img_tag in main_content.find_all('img', src=True):
            src = img_tag['src']
            links_set.add(('Image', src))

        unique_links = [{'url': url, 'type': link_type} for link_type, url in links_set]

        logging.info(f"Extracted {len(unique_links)} unique child links from {parent_url}.")
        return unique_links
    except Exception as e:
        logging.error(f"Error processing page content from {parent_url}.", exc_info=True)
        return []


def compile_results(df, driver, session_cookies, username, password, login_url, wait_time, element_wait_time, without_login):
    """Check every URL in *df* and build the per-link result rows.

    Args:
        df: DataFrame with a mandatory 'URL' column of parent pages.
        driver: Selenium WebDriver (None when ``without_login`` is True).
        session_cookies: Mutable cookie dict shared with `requests`
            (None when ``without_login`` is True).
        username, password, login_url: Credentials used for one re-login
            attempt if the session expires mid-run.
        wait_time, element_wait_time: Timing knobs forwarded to helpers.
        without_login: When True, only test public accessibility of each URL.

    Returns:
        List of dicts with keys Parent URL / Child URL / Link Type /
        Status / Notes.

    Raises:
        KeyError: If the 'URL' column is missing from *df*.
    """
    results = []
    reauth_attempted = False
    link_status_dict = {}  # cache: child URL -> last computed status

    if 'URL' not in df.columns:
        logging.error("Excel file does not contain 'URL' column.")
        st.error("Excel file does not contain 'URL' column.")
        raise KeyError("Excel file does not contain 'URL' column.")

    total_urls = len(df)
    progress_bar = st.progress(0)
    progress_text = st.empty()

    for index, row in df.iterrows():
        parent_url = row['URL']
        logging.info(f"Processing URL: {parent_url}")

        if without_login:
            # Anonymous mode: one plain HTTP check per parent URL.
            status = check_link_status_without_login(parent_url, login_url)
            note = ""
            if status == "Publicly Accessible":
                note = "URL is accessible without authentication."
            elif status == "Requires Authentication":
                note = "URL correctly requires authentication."
            results.append({
                'Parent URL': parent_url,
                'Child URL': '',
                'Link Type': '',
                'Status': status,
                'Notes': note
            })
        else:
            child_links = extract_links(driver, parent_url, wait_time, element_wait_time)

            if not child_links:
                results.append({
                    'Parent URL': parent_url,
                    'Child URL': '',
                    'Link Type': '',
                    'Status': 'No Links Found',
                    'Notes': 'Main content container not found.'
                })
                progress = (index + 1) / total_urls
                progress_bar.progress(progress)
                progress_text.text(f"Processed {index + 1} of {total_urls} URLs.")
                continue

            all_ok = True  # flips to False on any broken/auth-failed child link
            for link in child_links:
                link_url = link['url']
                link_type = link['type']

                if link_url in link_status_dict:
                    status = link_status_dict[link_url]
                    logging.info(f"Link already checked: {link_url}. Using cached status.")
                else:
                    status = check_link_status(link_url, session_cookies, login_url)
                    link_status_dict[link_url] = status

                    if status == "Requires Authentication":
                        if not reauth_attempted:
                            # Session may have expired; try one re-login, then
                            # re-check the link with refreshed cookies.
                            reauth_success = login_to_portal(driver, username, password, login_url, element_wait_time)
                            if reauth_success:
                                session_cookies.clear()
                                session_cookies.update(get_session_cookies(driver))
                                reauth_attempted = True
                                status = check_link_status(link_url, session_cookies, login_url)
                                link_status_dict[link_url] = status
                                if status == "Requires Authentication":
                                    logging.error(f"Link still redirects to login after re-authentication: {link_url}")
                            else:
                                logging.error("Re-authentication failed. Cannot proceed with link checking.")
                                status = "Requires Authentication"
                                link_status_dict[link_url] = status
                        else:
                            logging.error("Re-authentication already attempted. Skipping further re-authentication.")

                # BUG FIX: evaluate all_ok for cached statuses too. Previously
                # a cached "Broken"/"Requires Authentication" child link left
                # the parent incorrectly summarized as "All Links OK".
                if status in ("Broken", "Requires Authentication"):
                    all_ok = False

                results.append({
                    'Parent URL': parent_url,
                    'Child URL': link_url,
                    'Link Type': link_type,
                    'Status': status,
                    'Notes': ''
                })

            # Summary row for the parent page.
            if all_ok:
                results.append({
                    'Parent URL': parent_url,
                    'Child URL': '',
                    'Link Type': '',
                    'Status': 'All Links OK',
                    'Notes': 'Everything is working correctly.'
                })
            else:
                results.append({
                    'Parent URL': parent_url,
                    'Child URL': '',
                    'Link Type': '',
                    'Status': 'Some Links Broken',
                    'Notes': 'One or more child links are broken.'
                })

        # Update progress bar and text for both scenarios
        progress = (index + 1) / total_urls
        progress_bar.progress(progress)
        progress_text.text(f"Processed {index + 1} of {total_urls} URLs.")

    logging.info(f"Compiled results for {total_urls} URLs.")
    progress_text.text("Processing complete.")
    return results

def generate_report(results):
    """Render the result rows into a formatted XLSX workbook held in memory.

    Args:
        results: List of row dicts produced by compile_results.

    Returns:
        BytesIO positioned at offset 0 containing the finished workbook.

    Raises:
        Re-raises any failure after logging it and showing a UI error.
    """
    try:
        df_report = pd.DataFrame(results)
        df_report.sort_values(by=['Parent URL', 'Child URL'], inplace=True)

        output = BytesIO()
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            df_report.to_excel(writer, index=False, sheet_name='Link Check Report')

            workbook = writer.book
            worksheet = writer.sheets['Link Check Report']

            header_format = workbook.add_format({
                'bold': True,
                'text_wrap': True,
                'valign': 'top',
                'fg_color': '#D7E4BC',
                'border': 1
            })
            # Create the stripe format once; the original allocated a brand-new
            # format object on every even row, bloating the workbook.
            stripe_format = workbook.add_format({'bg_color': '#F0F0F0'})

            for col_num, value in enumerate(df_report.columns.values):
                worksheet.write(0, col_num, value, header_format)
                # URL columns get extra width for readability.
                width = 50 if value in ['Parent URL', 'Child URL'] else 20
                worksheet.set_column(col_num, col_num, width)

            # Zebra-stripe even data rows (data starts at worksheet row 1).
            for row_num in range(2, len(df_report) + 1, 2):
                worksheet.set_row(row_num, None, stripe_format)

            worksheet.freeze_panes(1, 0)  # keep the header visible when scrolling
            worksheet.autofilter(0, 0, len(df_report), len(df_report.columns) - 1)

        output.seek(0)
        logging.info("Report generated in memory with basic formatting.")
        return output
    except Exception:
        logging.error("Failed to generate report in memory.", exc_info=True)
        st.error("Failed to generate report.")
        raise

# --- Streamlit page chrome: tab title, logo, and intro copy ---------------
st.set_page_config(page_title="MSD Broken Link Checker", layout="wide")
st.image("MSD_logo.png", width=200)  # logo file is expected next to this script
st.markdown("<h1 style='text-align: center;'>MSD Broken Link Checker</h1>", unsafe_allow_html=True)
st.markdown("<p style='text-align: center;'>Upload your Excel file to check for broken links on your portal.</p>", unsafe_allow_html=True)


# Input form: mode toggle, credentials, file upload, and timing knobs.
# Submitting the form triggers the processing branch below.
with st.form("link_checker_form"):
    st.header("Setup")
    # When checked, links are tested anonymously (no Selenium login).
    without_login = st.checkbox("Check Links Without Login")
    
    username = st.text_input("Web Username")
    password = st.text_input("Web Password", type="password")
    
    #if without_login:
    #    st.write("**Note:** Username and Password fields are disabled.")
    
    uploaded_file = st.file_uploader("Upload Excel File with URLs", type=["xlsx", "xls"])
    st.info("The report will be available for download after processing.")
    st.markdown("""
    <details>
    <summary>Advanced Settings</summary>
    - **Wait Time:** Time in seconds the script waits after navigating to a page.
    - **Element Wait Time:** Time in seconds the script waits for elements to load.
    </details>
    """, unsafe_allow_html=True)
    wait_time = st.number_input("Wait Time (seconds)", min_value=1, max_value=60, value=4, help="Time in seconds the script waits after navigating to a page.")
    element_wait_time = st.number_input("Element Wait Time (seconds)", min_value=1, max_value=60, value=15, help="Time in seconds the script waits for elements to load.")
    submit_button = st.form_submit_button("Check Links")

if submit_button:
    # Surface validation errors immediately; processing only runs when the
    # required inputs for the selected mode are present.
    if without_login:
        if not uploaded_file:
            st.error("Please upload the Excel file.")
    else:
        if not all([username, password, uploaded_file]):
            st.error("Please fill in all the fields.")

    if without_login and uploaded_file:
        # Anonymous mode: plain HTTP checks, no WebDriver needed.
        try:
            with st.spinner('Reading Excel file...'):
                df_input = read_excel(uploaded_file)
            login_url = "https://profesionales.msd.es/login/"
            with st.spinner('Checking links without login...'):
                results = compile_results(df_input, None, None, username, password, login_url, wait_time, element_wait_time, without_login=True)

            with st.spinner('Generating report...'):
                report_buffer = generate_report(results)

            st.success("Report generation complete.")

            # Provide a download button for the report
            st.download_button(
                label="Download Report",
                data=report_buffer,
                file_name=f"{os.path.splitext(uploaded_file.name)[0]}_checked_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
        except Exception:
            st.error("An unexpected error occurred. Please check the logs for details.")
    elif not without_login and all([username, password, uploaded_file]):
        # Authenticated mode: log in with Selenium, then check child links.
        # BUG FIX: bind `driver` before the try block — if initialize_driver
        # raised, the finally clause previously hit a NameError on driver.quit().
        driver = None
        try:
            with st.spinner('Initializing WebDriver...'):
                driver = initialize_driver(wait_time, element_wait_time)
            login_url = "https://profesionales.msd.es/login/"
            with st.spinner('Logging in...'):
                login_success = login_to_portal(driver, username, password, login_url, element_wait_time)
            if not login_success:
                st.error("Login failed. Please check your credentials.")
            else:
                st.success("Logged in successfully.")
                with st.spinner('Reading Excel file...'):
                    df_input = read_excel(uploaded_file)
                with st.spinner('Checking links...'):
                    session_cookies = get_session_cookies(driver)
                    results = compile_results(df_input, driver, session_cookies, username, password, login_url, wait_time, element_wait_time, without_login=False)

                with st.spinner('Generating report...'):
                    report_buffer = generate_report(results)

                st.success("Report generation complete.")

                # Provide a download button for the report
                st.download_button(
                    label="Download Report",
                    data=report_buffer,
                    file_name=f"{os.path.splitext(uploaded_file.name)[0]}_checked_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )
        except Exception:
            st.error("An unexpected error occurred. Please check the logs for details.")
        finally:
            if driver is not None:
                driver.quit()
        
# Fixed footer pinned to the bottom of the viewport showing the app version.
st.markdown("""
        <style>
        .version-footer {
            position: fixed;
            left: 0;
            bottom: 10px;
            width: 100%;
            text-align: center;
            font-size: 12px;
            color: gray;
        }
        </style>
        <div class="version-footer">
            V.0.2
        </div>
        """, unsafe_allow_html=True)