#!/usr/bin/env python3
import os
import sys
import argparse
import pandas as pd
import time
import logging
import datetime
import json
import tempfile
import re
from dotenv import load_dotenv
from pathlib import Path
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
from bs4 import BeautifulSoup
from urllib.parse import urlparse

# Setup logging
# The log file lives alongside this script so runs started from any working
# directory all append to the same file.
log_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'product_inventory.log')
logging.basicConfig(
    filename=log_file,
    filemode='a',  # append across runs rather than truncating
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)
logging.info("Starting app_product_inventory.py with log file: %s", log_file)

def login_to_portal(username, password, login_url, headless=True):
    """Login to the portal and save storage state to a temp file for reuse.

    Drives the two-step login form (username, then username+password) with
    Playwright, then dumps the authenticated browser storage state to a
    temporary JSON file so later sessions can skip the login flow.

    Args:
        username: Portal account username.
        password: Portal account password.
        login_url: URL of the portal login page.
        headless: Run the browser without a visible window (default True).

    Returns:
        Tuple (success, storage_state_path). On success, storage_state_path
        is the path of a temp JSON file the CALLER must delete; on failure
        it is None.
    """
    playwright = None
    browser = None
    context = None
    page = None
    
    try:
        logging.info(f"Attempting login for user: {username}")
        playwright = sync_playwright().start()
        browser = playwright.chromium.launch(headless=headless)
        context = browser.new_context()
        page = context.new_page()
        
        logging.info("Navigating to login page...")
        page.goto(login_url)
        time.sleep(1)
        
        # Accept cookies if present (the banner does not appear on every run)
        try:
            accept_button = page.locator("#onetrust-accept-btn-handler")
            if accept_button.is_visible(timeout=3000):
                accept_button.click()
                logging.info("Accepted cookies during login attempt")
                time.sleep(1)
        except Exception as e:
            logging.info(f"Cookie dialog not found or error during login: {e}")

        # Enter username
        page.locator("#capture_signInFull_username").fill(username)
        logging.info("Entered username")
        
        # Click next
        page.locator("#buttonNext_signInFull").click()
        logging.info("Clicked 'Siguiente' button")
        
        # Re-enter username and password (second step of the login form)
        username_field_retry = page.locator("#capture_signInFull_signInUsername")
        username_field_retry.wait_for(state='visible', timeout=10000)
        if username_field_retry.is_visible():
            username_field_retry.fill(username)
            logging.info("Re-entered username")
        
        page.locator("#capture_signInFull_currentPassword").fill(password)
        logging.info("Entered password")
        
        # Submit
        page.get_by_role("button", name="Acceda").click()
        logging.info("Clicked 'Acceda' button")

        # Wait for login confirmation: any URL not containing "login" counts
        page.wait_for_url(lambda url: "login" not in url, timeout=25000)
        logging.info(f"Login successful, redirected to: {page.url}")
        
        # Save storage state (cookies/local storage) for reuse by other sessions
        storage_state = context.storage_state()
        
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
            json.dump(storage_state, temp_file)
            storage_state_path = temp_file.name
            logging.info(f"Saved storage state to temporary file: {storage_state_path}")
        
        return True, storage_state_path
    except Exception as e:
        logging.error(f"An error occurred during login: {e}", exc_info=True)
        return False, None
    finally:
        # Cleanup resources used for login. Each teardown step is isolated so
        # that a failure in one (e.g. page.close raising) does not skip the
        # remaining closes and leak the browser/driver processes.
        for resource, action in ((page, 'close'), (context, 'close'),
                                 (browser, 'close'), (playwright, 'stop')):
            if resource is None:
                continue
            try:
                getattr(resource, action)()
            except Exception as cleanup_err:
                logging.warning(f"Error during login cleanup: {cleanup_err}")

def read_user_pass_file(file_path):
    """Read username:password pairs from txt file.

    Each non-empty line containing a ':' is split on the FIRST colon only
    (so passwords may themselves contain colons); lines without a colon are
    ignored. Whitespace around both fields is stripped.

    Args:
        file_path: Path to the credentials file.

    Returns:
        List of (username, password) tuples.

    Raises:
        OSError / UnicodeDecodeError: re-raised after logging if the file
        cannot be read.
    """
    try:
        user_pass_pairs = []
        # Explicit UTF-8: the default encoding is platform-dependent and can
        # corrupt credentials containing non-ASCII characters on Windows.
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if ':' in line:
                    username, password = line.split(':', 1)
                    user_pass_pairs.append((username.strip(), password.strip()))
        logging.info(f"User/pass file read successfully: {file_path}, found {len(user_pass_pairs)} pairs")
        return user_pass_pairs
    except Exception as e:
        logging.error(f"Error reading user/pass file: {e}", exc_info=True)
        raise

def read_txt_urls(file_path):
    """Read URLs from txt file, one per line.

    Blank lines (and lines of pure whitespace) are skipped; every other
    line is returned stripped of surrounding whitespace.
    """
    try:
        urls = []
        with open(file_path, 'r') as f:
            for raw_line in f:
                candidate = raw_line.strip()
                if candidate:
                    urls.append(candidate)
        logging.info(f"Txt file read successfully: {file_path}, processing {len(urls)} URLs")
        return urls
    except Exception as e:
        logging.error(f"Error reading txt file: {e}", exc_info=True)
        raise

def extract_audience_from_username(username):
    """Extract audience from username like WPVipSpain+WPAAnatomiaPatologica@msd.com -> WPAAnatomiaPatologica"""
    try:
        # The audience is the plus-addressed tag: everything between the
        # first '+' and the following '@'.
        found = re.search(r'\+(.+?)@', username)
        return found.group(1) if found else "Unknown"
    except Exception as e:
        logging.error(f"Error extracting audience from {username}: {e}")
        return "Unknown"

def parse_url_structure(url):
    """Parse URL to extract therapeutic area and pathology.

    Examples:
    - https://profesionales.msd.es/areas_terapeuticas/oncologia/ -> ('oncologia', None)
    - https://profesionales.msd.es/areas_terapeuticas/oncologia/cancer-de-cervix/ -> ('oncologia', 'cancer-de-cervix')

    Returns (None, None) when the path does not contain 'areas_terapeuticas'
    or the URL cannot be parsed.
    """
    try:
        segments = [seg for seg in urlparse(url).path.split('/') if seg]

        if 'areas_terapeuticas' not in segments:
            return None, None

        # The two path segments after 'areas_terapeuticas' are, in order,
        # the therapeutic area and (optionally) the pathology.
        anchor = segments.index('areas_terapeuticas')
        trailing = segments[anchor + 1:anchor + 3]
        therapeutic_area = trailing[0] if len(trailing) >= 1 else None
        pathology = trailing[1] if len(trailing) >= 2 else None
        return therapeutic_area, pathology
    except Exception as e:
        logging.error(f"Error parsing URL structure for {url}: {e}")
        return None, None

def extract_products_from_page(storage_state_path, page_url, wait_time=4, verbose=False):
    """Extract products from a webpage using Playwright and BeautifulSoup.

    Opens the page in an authenticated browser context (restored from the
    storage-state JSON written by login_to_portal), parses the rendered HTML
    and collects product slugs from anchors pointing at /productos/ URLs
    inside the main content area only.

    Args:
        storage_state_path: Path to a JSON file with Playwright storage state.
        page_url: URL to scrape.
        wait_time: Seconds to wait after 'networkidle' before parsing.
        verbose: Print progress/debug output (and dump the HTML to disk).

    Returns:
        List of unique product names, in first-seen order; empty list on
        any error or when the main content container is missing.
    """
    playwright = None
    browser = None
    context = None
    page = None
    
    try:
        if verbose:
            print(f"  Opening page to extract products...")
            
        playwright = sync_playwright().start()
        browser = playwright.chromium.launch(headless=True)
        
        # Load storage state from the temporary file
        with open(storage_state_path, 'r') as f:
            storage_state = json.load(f)
        
        context = browser.new_context(storage_state=storage_state)
        page = context.new_page()
        
        page.goto(page_url, wait_until="networkidle", timeout=60000)
        logging.info(f"Navigated to URL: {page_url}")
        page.wait_for_timeout(wait_time * 1000)  # Convert seconds to milliseconds
        
        if verbose:
            print(f"  Parsing HTML content...")
            
        html_content = page.content()
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Debug: Save HTML to file if verbose
        if verbose:
            debug_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'debug_html')
            os.makedirs(debug_dir, exist_ok=True)
            debug_file = os.path.join(debug_dir, f"debug_page_{datetime.datetime.now().strftime('%H%M%S')}.html")
            with open(debug_file, 'w', encoding='utf-8') as f:
                f.write(html_content)
            print(f"  [DEBUG] Saved page HTML to: {debug_file}")
        
        products = []
        
        # IMPORTANT: Only search within main content area to avoid picking up navigation menu
        main_content = soup.find(id="mhh_mcn_main")
        if not main_content:
            logging.warning(f"No main content found (id='mhh_mcn_main') for URL: {page_url}")
            if verbose:
                print(f"  [WARNING] No main content container found (id='mhh_mcn_main')")
            return []
        
        if verbose:
            print(f"  [DEBUG] Found main content area, searching for products...")
        
        # Look for the specific product container structure with class pattern
        # Class contains: mhh-mcn-columns mhh-mcn-v1-columns--[hash] mhh-mcn-v1-columns mhh-mcn-columns--gutter-m
        product_containers = main_content.find_all('div', class_=lambda x: x and 'mhh-mcn-columns' in x and 'mhh-mcn-v1-columns' in x and 'gutter' in x)
        
        if verbose:
            print(f"  [DEBUG] Found {len(product_containers)} product containers in main content")
        
        for container in product_containers:
            # Find columns within this container
            columns = container.find_all('div', class_=lambda x: x and 'mhh-mcn-column' in x and 'mhh-mcn-v1-column' in x)
            
            if verbose:
                print(f"  [DEBUG] Found {len(columns)} columns in container")
            
            for column in columns:
                # Find all anchors with /productos/ in this column
                anchors = column.find_all('a', href=lambda x: x and '/productos/' in x)
                
                for anchor in anchors:
                    href = anchor.get('href', '')
                    # Extract the product name (the part after /productos/)
                    product_match = re.search(r'/productos/([^/]+)/?', href)
                    if product_match:
                        product_name = product_match.group(1)
                        # Deduplicate while preserving first-seen order
                        if product_name not in products:
                            products.append(product_name)
                            if verbose:
                                print(f"    Found product: {product_name}")
        
        logging.info(f"Extracted {len(products)} products from {page_url}")
        if verbose:
            print(f"  Extracted {len(products)} products: {products}")
        
        return products
        
    except Exception as e:
        logging.error(f"Error extracting products from {page_url}: {e}", exc_info=True)
        if verbose:
            print(f"  Error extracting products: {str(e)}")
        return []
    finally:
        # Isolate each teardown step so one failing close (e.g. a crashed
        # page) cannot skip the rest and leak the browser/driver processes.
        for resource, action in ((page, 'close'), (context, 'close'),
                                 (browser, 'close'), (playwright, 'stop')):
            if resource is None:
                continue
            try:
                getattr(resource, action)()
            except Exception as cleanup_err:
                logging.warning(f"Error during page cleanup: {cleanup_err}")

def inventory_homes(user_pass_pairs, homepage_url, login_url, wait_time, verbose=False):
    """Inventory products on home page for each user.

    Logs in once per credential pair (the home page content is personalized
    per audience), scrapes the homepage, and always removes the temporary
    storage-state file afterwards.

    Args:
        user_pass_pairs: List of (username, password) tuples.
        homepage_url: URL of the portal home page.
        login_url: URL of the portal login page.
        wait_time: Seconds to wait after page load before parsing.
        verbose: Print per-user progress.

    Returns list of dicts with structure:
    {
        'url': homepage_url,
        'user': username,
        'audience': audience,
        'products': [list of products]
    }
    Entries for failed logins additionally carry 'error': 'Login Failed'.
    """
    results = []
    
    for username, password in user_pass_pairs:
        logging.info(f"Processing user: {username}")
        if verbose:
            print(f"\nProcessing user: {username}")
        
        # Extract audience
        audience = extract_audience_from_username(username)
        
        # Login for this user
        login_success, storage_state_path = login_to_portal(username, password, login_url, headless=True)
        if not login_success:
            logging.error(f"Login failed for user: {username}")
            if verbose:
                print(f"  Login failed for user: {username}")
            results.append({
                'url': homepage_url,
                'user': username,
                'audience': audience,
                'products': [],
                'error': 'Login Failed'
            })
            continue
        
        try:
            # Extract products
            products = extract_products_from_page(storage_state_path, homepage_url, wait_time, verbose)
        finally:
            # Always remove the temp storage-state file, even if extraction
            # is interrupted, so credentials-bearing files never linger.
            if storage_state_path and os.path.exists(storage_state_path):
                os.unlink(storage_state_path)
        
        results.append({
            'url': homepage_url,
            'user': username,
            'audience': audience,
            'products': products
        })
        
        if verbose:
            print(f"  Found {len(products)} products for audience {audience}")
    
    return results

def inventory_urls(urls, default_username, default_password, login_url, wait_time, verbose=False):
    """Inventory products from a list of URLs (therapeutic areas and pathologies).

    Logs in once with the default credentials, then scrapes every URL with
    the same session. The temporary storage-state file is removed in a
    finally block so it cannot be left behind if scraping is interrupted.

    Returns tuple of (therapeutic_areas, pathologies):
    - therapeutic_areas: list of dicts with structure:
        {
            'url': url,
            'name': 'oncologia',
            'products': [list of products]
        }
    - pathologies: list of dicts with structure:
        {
            'url': url,
            'therapeutic_area': 'oncologia',
            'pathology_name': 'cancer-de-cervix',
            'products': [list of products]
        }
    """
    therapeutic_areas = []
    pathologies = []
    
    # Login with default user
    if verbose:
        print(f"\nLogging in with default user: {default_username}")
    
    login_success, storage_state_path = login_to_portal(default_username, default_password, login_url, headless=True)
    if not login_success:
        logging.error("Default login failed. Cannot proceed with URL inventory.")
        if verbose:
            print("  Default login failed!")
        return therapeutic_areas, pathologies
    
    try:
        # Process each URL
        for url in urls:
            if verbose:
                print(f"\nProcessing URL: {url}")
            
            # Parse URL structure
            therapeutic_area, pathology = parse_url_structure(url)
            
            if not therapeutic_area:
                logging.warning(f"Could not parse URL structure for: {url}")
                if verbose:
                    print(f"  Warning: Could not parse URL structure")
                continue
            
            # Extract products
            products = extract_products_from_page(storage_state_path, url, wait_time, verbose)
            
            # Classify as therapeutic area or pathology based on URL depth
            if pathology is None:
                # This is a therapeutic area
                therapeutic_areas.append({
                    'url': url,
                    'name': therapeutic_area,
                    'products': products
                })
                if verbose:
                    print(f"  Therapeutic Area: {therapeutic_area} - {len(products)} products")
            else:
                # This is a pathology
                pathologies.append({
                    'url': url,
                    'therapeutic_area': therapeutic_area,
                    'pathology_name': pathology,
                    'products': products
                })
                if verbose:
                    print(f"  Pathology: {therapeutic_area}/{pathology} - {len(products)} products")
    finally:
        # Always remove the credentials-bearing temp file, even if a scrape
        # raises or the run is interrupted mid-loop.
        if storage_state_path and os.path.exists(storage_state_path):
            os.unlink(storage_state_path)
    
    return therapeutic_areas, pathologies

def generate_json_report(homes_data, therapeutic_areas_data, pathologies_data, output_path=None):
    """Generate JSON report with product inventory.

    Structure:
    {
        "timestamp": "2024-10-09T12:00:00",
        "homes": [...],
        "therapeutic_areas": [...],
        "pathologies": [...]
    }

    When output_path is omitted, a timestamped filename is generated in the
    current directory. Returns the path of the written file.
    """
    # Default to a timestamped filename when none was supplied
    if not output_path:
        stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        output_path = f"product_inventory_{stamp}.json"
    
    # Make sure the destination directory exists before writing
    target_dir = os.path.dirname(os.path.abspath(output_path))
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    
    report = {
        "timestamp": datetime.datetime.now().isoformat(),
        "homes": homes_data,
        "therapeutic_areas": therapeutic_areas_data,
        "pathologies": pathologies_data,
    }
    
    # ensure_ascii=False keeps accented Spanish names readable in the file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    
    logging.info(f"JSON report generated: {output_path}")
    return output_path

def print_summary(homes_data, therapeutic_areas_data, pathologies_data):
    """Print summary statistics"""
    divider = "=" * 60
    print("\n" + divider)
    print("PRODUCT INVENTORY SUMMARY")
    print(divider)
    
    print(f"\nHomes:")
    print(f"  Total users checked: {len(homes_data)}")
    for entry in homes_data:
        # Failed logins carry an 'error' key instead of a product list
        if 'error' in entry:
            status = f"({entry['error']})"
        else:
            status = f"{len(entry['products'])} products"
        print(f"    - {entry['audience']}: {status}")
    
    print(f"\nTherapeutic Areas:")
    print(f"  Total checked: {len(therapeutic_areas_data)}")
    for area in therapeutic_areas_data:
        print(f"    - {area['name']}: {len(area['products'])} products")
    
    print(f"\nPathologies:")
    print(f"  Total checked: {len(pathologies_data)}")
    for record in pathologies_data:
        print(f"    - {record['therapeutic_area']}/{record['pathology_name']}: {len(record['products'])} products")
    
    print("\n" + divider)

def main():
    """CLI entry point: parse arguments, run both inventory phases, write the
    JSON report, and return 0 on success (exits via sys.exit on failure)."""
    parser = argparse.ArgumentParser(description='MSD Product Inventory - Extract products from pages by audience')
    parser.add_argument('--user-pass-file', required=True, help='Path to txt file with username:password pairs for home pages')
    parser.add_argument('--urls-file', required=True, help='Path to txt file with therapeutic area and pathology URLs')
    parser.add_argument('--output', help='Path to save the JSON output (default: product_inventory_TIMESTAMP.json)')
    parser.add_argument('--wait-time', type=int, default=4, help='Wait time in seconds after page loads (default: 4)')
    parser.add_argument('--default-username', help='Default username for therapeutic areas/pathologies (overrides .env file)')
    parser.add_argument('--default-password', help='Default password for therapeutic areas/pathologies (overrides .env file)')
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output showing progress')
    
    args = parser.parse_args()
    
    # Load environment variables from a .env file (must happen before the
    # os.environ lookups below so .env values are visible)
    load_dotenv()
    
    # Determine login credentials for additional URLs.
    # CLI arguments take precedence over environment variables.
    default_username = args.default_username or os.environ.get('MSD_USERNAME')
    default_password = args.default_password or os.environ.get('MSD_PASSWORD')
    login_url = "https://profesionales.msd.es/login/"
    homepage_url = "https://profesionales.msd.es/"
    
    if not default_username or not default_password:
        logging.error("Default username and password are required")
        sys.exit("Error: Default username and password are required. "
                 "Set them in .env file or provide as arguments.")
    
    try:
        # Read user:pass pairs for homes
        if args.verbose:
            print(f"Reading user:pass file: {args.user_pass_file}")
        user_pass_pairs = read_user_pass_file(args.user_pass_file)
        
        # Read URLs for therapeutic areas and pathologies
        if args.verbose:
            print(f"Reading URLs file: {args.urls_file}")
        urls = read_txt_urls(args.urls_file)
        
        # Phase 1: inventory the homepage once per audience-specific user
        if args.verbose:
            print(f"\n{'='*60}")
            print(f"PHASE 1: INVENTORYING HOMES")
            print(f"{'='*60}")
        
        homes_data = inventory_homes(
            user_pass_pairs,
            homepage_url,
            login_url,
            args.wait_time,
            args.verbose
        )
        
        # Phase 2: inventory therapeutic areas and pathologies with the
        # single default user (content there is not audience-specific)
        if args.verbose:
            print(f"\n{'='*60}")
            print(f"PHASE 2: INVENTORYING THERAPEUTIC AREAS & PATHOLOGIES")
            print(f"{'='*60}")
        
        therapeutic_areas_data, pathologies_data = inventory_urls(
            urls,
            default_username,
            default_password,
            login_url,
            args.wait_time,
            args.verbose
        )
        
        # Generate JSON report
        if args.verbose:
            print(f"\n{'='*60}")
            print(f"GENERATING REPORT")
            print(f"{'='*60}")
        
        output_path = generate_json_report(
            homes_data,
            therapeutic_areas_data,
            pathologies_data,
            args.output
        )
        
        print(f"\nJSON report saved to: {output_path}")
        
        # Print summary
        if args.verbose:
            print_summary(homes_data, therapeutic_areas_data, pathologies_data)
        
        logging.info("Product inventory completed successfully")
        print("\nProduct inventory completed successfully!")
        
        return 0
        
    except Exception as e:
        # Top-level boundary: log the full traceback, then exit with a
        # short user-facing message (non-zero status).
        logging.error(f"Error in main function: {e}", exc_info=True)
        sys.exit(f"Error: {e}")

if __name__ == "__main__":
    sys.exit(main())
