#!/usr/bin/env python3
"""
School Website Scraper - Trip Contact Finder
Simple, robust scraper using requests
"""

import csv
import re
import shutil
import subprocess
import sys
import time

# Pre-compiled matcher for plain-text email addresses (simple heuristic, not RFC 5322-complete).
EMAIL_REGEX = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}')
# Case-insensitive markers used to spot trip/activity-related page text.
# Some entries carry deliberate spaces ('pe ', ' sport ') to reduce substring
# false positives (e.g. 'pe' inside 'open', 'sport' inside 'transport').
TRIP_KEYWORDS = ['trip', 'visit', 'educational', 'outdoor', 'ks2', 'year 5', 'year 6', 
                 'extra curricular', 'extracurricular', 'activity', 'pe ', ' sport ', 
                 'stem', 'english', 'coordinator', 'lead']

def clean_url(url):
    """Normalise a raw URL string.

    Trims surrounding whitespace and double quotes, prepends 'http://' when
    no scheme is present, and drops any trailing slash.
    """
    cleaned = url.strip().strip('"')
    has_scheme = cleaned.startswith('http://') or cleaned.startswith('https://')
    if not has_scheme:
        cleaned = 'http://' + cleaned
    return cleaned.rstrip('/')

def curl_fetch(url):
    """Fetch *url* with the system `curl` binary; return the response body or None.

    Follows redirects (-L), runs silently (-s), caps curl at 15s total /
    10s connect, and gives the subprocess itself a 20s hard timeout.
    Any failure (non-zero exit status, timeout, missing curl binary) yields
    None so callers can treat fetching as best-effort.
    """
    try:
        result = subprocess.run(
            ['curl', '-s', '-L', '-m', '15', '--connect-timeout', '10', url],
            capture_output=True, text=True, timeout=20
        )
    except (subprocess.TimeoutExpired, OSError):
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and made the script unkillable mid-fetch.
        return None
    return result.stdout if result.returncode == 0 else None

def find_emails(content):
    """Return up to three unique email addresses from *content*, in first-seen order."""
    if not content:
        return []
    unique = []
    for match in EMAIL_REGEX.finditer(content):
        address = match.group(0)
        if address not in unique:
            unique.append(address)
        if len(unique) == 3:
            break
    return unique

def find_staff_link(content, base_url):
    """Return an absolute URL to a staff/team page linked from *content*, or None.

    Scans href attributes (double- or single-quoted) for staff-related words
    and resolves the first hit against *base_url* with urljoin, which
    correctly handles absolute URLs, root-relative links ('/staff') and
    page-relative links ('staff.html', '../staff') — the old string
    concatenation mis-joined the relative cases.
    """
    if not content:
        return None
    from urllib.parse import urljoin

    patterns = [
        r'href="([^"]*(?:staff|team|people|teachers|faculty)[^"]*)"',
        r"href='([^']*(?:staff|team|people|teachers|faculty)[^']*)'",
    ]
    for pattern in patterns:
        matches = re.findall(pattern, content, re.IGNORECASE)
        if matches:
            # base_url arrives without a trailing slash (see clean_url); add one
            # so page-relative links resolve under it. urljoin leaves
            # already-absolute links untouched.
            return urljoin(base_url + '/', matches[0])
    return None

def find_trip_info(content):
    """Return the first trip-related keyword found in *content* (case-insensitive), else None."""
    if not content:
        return None
    haystack = content.lower()
    return next((kw for kw in TRIP_KEYWORDS if kw.lower() in haystack), None)

def _find_contact_name(staff_content):
    """Scan staff-page lines for trip keywords and return an adjacent '<First Last>' name, or None.

    For each line containing a keyword, looks at a 5-line window around it and
    pulls the first 'Firstname Lastname' text directly inside an HTML tag.
    """
    lines = staff_content.split('\n')
    for i, line in enumerate(lines):
        if any(kw in line.lower() for kw in TRIP_KEYWORDS):
            context = ' '.join(lines[max(0, i - 2):i + 3])
            names = re.findall(r'<[^>]*>([A-Z][a-z]+ [A-Z][a-z]+)<', context)
            if names:
                return names[0]
    return None

def scrape_school(name, website):
    """Scrape one school's website for a trip-contact name/email.

    Fetches the homepage, harvests emails and trip keywords, then follows a
    staff-page link (if any) looking for a named trip/activity contact.
    Returns a dict with keys: school, name, title, email, website, notes.
    Fetch failures never raise; they are reported via the 'notes' field.
    """
    website = clean_url(website)

    result = {
        'school': name,
        'name': '',
        'title': '',
        'email': '',
        'website': website,
        'notes': ''
    }

    # Fetch main page; bail out early with a note on failure.
    content = curl_fetch(website)
    if not content:
        result['notes'] = 'Failed to fetch website'
        return result

    # First email on the homepage is the fallback contact.
    emails = find_emails(content)
    if emails:
        result['email'] = emails[0]

    # Note any trip keyword found on the homepage.
    trip_info = find_trip_info(content)
    if trip_info:
        result['notes'] = f'Trip/visit mention found ({trip_info})'

    # Follow a staff/team page if one is linked.
    staff_url = find_staff_link(content, website)
    if staff_url:
        staff_content = curl_fetch(staff_url)
        if staff_content:
            staff_emails = find_staff_emails(staff_content)
            if staff_emails and not result['email']:
                result['email'] = staff_emails[0]

            staff_trip = find_trip_info(staff_content)
            if staff_trip:
                # A staff-page hit is more specific, so it overrides the
                # homepage note. (Was an f-string with no placeholders.)
                result['notes'] = 'Trip/activity role found on staff page'
                contact = _find_contact_name(staff_content)
                if contact:
                    result['name'] = contact
                    result['title'] = f'Found: {staff_trip}'

    if not result['notes']:
        result['notes'] = 'No specific trip coordinator found'

    if not result['email']:
        result['notes'] += ' - contact via school office'

    return result

def find_staff_emails(content):
    """Find up to three unique emails on a staff page.

    Delegates to find_emails — the two functions were byte-for-byte duplicate
    implementations; this keeps a single source of truth for email extraction.
    """
    return find_emails(content)

def main():
    """Batch-scrape every school in the queue CSV, appending to the results CSV.

    Reads (name, website) rows from the queue file, backs up the existing
    results file, scrapes each school, and appends one result row per school.
    Returns the number of schools for which a named contact was found.
    """
    queue_file = '/data/.openclaw/workspace/Immersia XR/scrape_queue.csv'
    results_file = '/data/.openclaw/workspace/Immersia XR/scraped_contacts_results.csv'

    # Read queue: header row, then rows of (school name, website).
    schools = []
    with open(queue_file, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip header; tolerate an empty file
        for row in reader:
            if len(row) >= 2 and row[0] and row[1]:
                schools.append((row[0], row[1]))

    print(f'Processing {len(schools)} schools...')

    # Backup current results. Best-effort (mirrors the old unchecked `cp`
    # subprocess), but portable and without spawning a shell command.
    try:
        shutil.copyfile(results_file, results_file + '.backup')
    except OSError:
        pass

    # Count existing entries (minus the header row).
    with open(results_file, 'r', encoding='utf-8') as f:
        existing = sum(1 for _ in f) - 1

    print(f'Existing entries: {existing}')

    # Process each school, appending as we go so a crash loses nothing.
    contacts_found = 0

    with open(results_file, 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)

        for i, (name, website) in enumerate(schools):
            print(f'[{i+1}/{len(schools)}] {name[:40]}...')

            result = scrape_school(name, website)

            writer.writerow([
                result['school'],
                result['name'],
                result['title'],
                result['email'],
                result['website'],
                result['notes'],
            ])

            if result['name']:
                contacts_found += 1

            time.sleep(0.3)  # polite rate limiting between sites

    print(f'\n=== COMPLETE ===')
    print(f'Total processed: {len(schools)}')
    print(f'Contacts found in this batch: {contacts_found}')

    return contacts_found

# Script entry point: run the batch scrape only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()