#!/usr/bin/env python3
"""
School Website Scraper - Trip Contact Finder
Scrapes school websites to find trip coordinators and contact details.
"""

import csv
import re
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import time
import json

# Keyword fragments identifying staff roles relevant to school-trip outreach.
# Matched case-insensitively against element text in find_contact_info; keep
# entries lowercase.
TARGET_ROLES = [
    'trip', 'educational visit', 'visit coordinator', 'outdoor education',
    'year 5', 'year 6', 'ks2', 'key stage 2',
    'english', 'stem', 'pe', 'physical education', 'sport',
    'extra-curricular', 'extracurricular', 'activities',
    'educational', 'learning', 'curriculum'
]

# Email address pattern.
# Fix: the original TLD class was [A-Z|a-z], where '|' is a literal character
# inside a character class, so addresses like "x@y.ab|cd" matched in full.
EMAIL_REGEX = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

def clean_url(url):
    """Normalize a URL: strip whitespace, default to http://, drop trailing slashes."""
    candidate = url.strip()
    has_scheme = candidate.startswith('http://') or candidate.startswith('https://')
    if not has_scheme:
        candidate = 'http://' + candidate
    # Trailing slashes are dropped so urljoin-based path handling stays uniform.
    return candidate.rstrip('/')

def find_staff_link(soup, base_url):
    """Return the href of the first staff/team-page link found, or None.

    Links inside <nav> elements are preferred; otherwise any link in the
    document whose visible text is short (< 30 chars) and mentions a staff
    keyword is accepted.  ``base_url`` is unused but kept for call
    compatibility.
    """
    staff_keywords = ['staff', 'team', 'people', 'meet the team', 'our team', 'teachers', 'faculty']

    def first_matching_href(anchors, max_len=None):
        # First anchor whose visible text contains any staff keyword.
        for anchor in anchors:
            label = anchor.get_text().lower().strip()
            if max_len is not None and len(label) >= max_len:
                continue
            if any(kw in label for kw in staff_keywords):
                return anchor['href']
        return None

    # Prefer navigation links.
    for nav in soup.find_all('nav'):
        href = first_matching_href(nav.find_all('a', href=True))
        if href is not None:
            return href

    # Fall back to short-labelled links anywhere in the page.
    return first_matching_href(soup.find_all('a', href=True), max_len=30)

def find_contact_info(soup, text_content):
    """Scan a parsed page for staff entries mentioning a target role.

    Args:
        soup: BeautifulSoup document to scan.
        text_content: raw page text (unused; kept for call compatibility).

    Returns:
        A list of dicts with 'name', 'title', 'email' keys — one per
        div/li/tr/p element whose text mentions a TARGET_ROLES keyword.
    """
    results = []

    # Fix: the original used a bare substring test (`role in text`), so short
    # keywords such as 'pe' matched inside unrelated words ('people', 'open').
    # Word-boundary anchors restrict matches to whole words/phrases.
    # Patterns are compiled once per call, not once per element.
    role_patterns = [
        (role, re.compile(r'\b' + re.escape(role) + r'\b'))
        for role in TARGET_ROLES
    ]

    for element in soup.find_all(['div', 'li', 'tr', 'p']):
        text = element.get_text().lower()

        # First role keyword mentioned in this element, if any.
        matched_role = None
        for role, pattern in role_patterns:
            if pattern.search(text):
                matched_role = role
                break

        if not matched_role:
            continue

        elem_text = element.get_text()
        emails = EMAIL_REGEX.findall(elem_text)

        # Try to find a person's name in a nearby heading or bold element.
        name = ""
        for prev in element.find_all_previous(['h1', 'h2', 'h3', 'h4', 'strong', 'b'], limit=3):
            prev_text = prev.get_text().strip()
            if prev_text and len(prev_text) < 50:
                name = prev_text
                break

        # Fall back to the element's own leading text.
        if not name:
            name = elem_text[:50].strip()

        results.append({
            'name': name,
            'title': f"Found: {matched_role}",
            'email': emails[0] if emails else ''
        })

    return results

def scrape_school(school_name, website):
    """Scrape a single school website for a trip-coordinator contact.

    Args:
        school_name: display name recorded in the result row.
        website: school URL; a scheme is added if missing.

    Returns:
        A dict with keys school_name/name/title/email/website/notes.
        This function never raises: failures are recorded in 'notes'
        so the batch run can continue (deliberate best-effort design).
    """
    website = clean_url(website)

    result = {
        'school_name': school_name,
        'name': '',
        'title': '',
        'email': '',
        'website': website,
        'notes': ''
    }

    # Plain browser UA — some school sites block obvious bot agents.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        response = requests.get(website, headers=headers, timeout=10, allow_redirects=True)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Contact details on the landing page take priority.
        contacts = find_contact_info(soup, response.text)
        if contacts:
            result['name'] = contacts[0]['name']
            result['title'] = contacts[0]['title']
            result['email'] = contacts[0]['email']

        # Follow a Staff/Team page, if one is linked.
        staff_link = find_staff_link(soup, website)
        if staff_link:
            staff_url = urljoin(website, staff_link)
            try:
                staff_response = requests.get(staff_url, headers=headers, timeout=10)
                # Fix: the original parsed error responses (404 etc.) silently;
                # raise so the failure is recorded in 'notes' instead.
                staff_response.raise_for_status()
                staff_soup = BeautifulSoup(staff_response.text, 'html.parser')

                staff_contacts = find_contact_info(staff_soup, staff_response.text)
                # Landing-page contact wins; staff page only fills a gap.
                if staff_contacts and not result['name']:
                    result['name'] = staff_contacts[0]['name']
                    result['title'] = staff_contacts[0]['title']
                    result['email'] = staff_contacts[0]['email']

            except Exception as e:
                result['notes'] = f"Staff page found but couldn't fetch: {str(e)[:50]}"

        # No named contact: fall back to any email visible on the landing page.
        if not result['name']:
            emails = EMAIL_REGEX.findall(response.text)
            if emails:
                result['email'] = emails[0]
                result['notes'] = "No trip coordinator found - general contact only"
            else:
                result['notes'] = "No specific trip coordinator found"

    except Exception as e:
        # Best-effort scraper: record the failure and move on.
        result['notes'] = f"Error: {str(e)[:80]}"

    return result

def main():
    """Drive the scrape: load the queue CSV, scrape each school, save results."""
    queue_file = '/data/.openclaw/workspace/Immersia XR/scrape_queue.csv'
    results_file = '/data/.openclaw/workspace/Immersia XR/scraped_contacts_results.csv'

    # Build the work list, skipping queue rows missing either field.
    with open(queue_file, 'r', encoding='utf-8') as f:
        schools = [
            {'name': row['School Name'], 'website': row['Website']}
            for row in csv.DictReader(f)
            if row.get('School Name') and row.get('Website')
        ]

    total = len(schools)
    print(f"Loaded {total} schools to scrape")

    all_results = []
    for index, school in enumerate(schools, start=1):
        print(f"[{index}/{total}] Scraping: {school['name']}...")

        result = scrape_school(school['name'], school['website'])
        all_results.append(result)

        print(f"  -> {result['name'] or 'No contact'} | {result['title'] or 'N/A'}")

        # Be polite to the servers.
        time.sleep(0.5)

        # Checkpoint every 20 schools so a crash loses little work.
        if index % 20 == 0:
            save_results(all_results, results_file)
            print(f"  >> Saved {index} results so far...")

    # Final save.
    save_results(all_results, results_file)

    # Summary.
    contacts_found = sum(1 for r in all_results if r['name'])
    print(f"\n=== COMPLETE ===")
    print(f"Total schools scraped: {total}")
    print(f"Contacts found: {contacts_found}")

    return contacts_found

def save_results(results, filepath):
    """Write *results* to *filepath* as CSV, replacing any previous contents.

    The caller (main) passes the FULL accumulated result list on every
    periodic checkpoint, so the file must be rewritten from scratch each
    time.  Fix: the original opened in append mode and only wrote the header
    when the file was empty, which duplicated every previously-saved row on
    each checkpoint and across re-runs.

    Args:
        results: list of dicts containing the six known result keys.
        filepath: destination CSV path.
    """
    fieldnames = ['school_name', 'name', 'title', 'email', 'website', 'notes']
    with open(filepath, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for r in results:
            # Restrict to known keys so extra dict entries can't break the writer.
            writer.writerow({k: r[k] for k in fieldnames})

# Script entry point: run the full scrape when executed directly.
if __name__ == '__main__':
    main()