#!/usr/bin/env python3
import csv
import re

# Default input/output locations; override through main()'s parameters.
SCHOOLS_CSV = 'schools_waterloo_council.csv'
SCRAPED_CSV = 'scraped_contacts_results.csv'
OUTPUT_CSV = 'schools_to_scrape.csv'

# Lower-case prefixes that mark a CSV cell as a website address.
_URL_PREFIXES = ('http://', 'https://', 'www.')


def _find_website(row):
    """Return the first cell of *row* that looks like a URL, or ``""``."""
    for cell in row:
        cell = cell.strip()
        # Compare case-insensitively so "HTTP://..." and "WWW...." also match.
        if cell.lower().startswith(_URL_PREFIXES):
            return cell
    return ""


def _load_schools(path):
    """Read the schools CSV and keep rows that have a name and a website.

    The name is taken from the first column; the website is the first cell
    anywhere in the row that starts with a known URL prefix.
    """
    schools = []
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(path, 'r', newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row; an empty file is not an error
        for row in reader:
            if len(row) < 2:
                continue
            name = row[0].strip()
            website = _find_website(row)
            if name and website:
                schools.append({'name': name, 'website': website})
    return schools


def _load_scraped_names(path):
    """Return the set of school names (first column) already scraped.

    A missing results file means nothing has been scraped yet, so an empty
    set is returned instead of raising.
    """
    names = set()
    try:
        with open(path, 'r', newline='', encoding='utf-8') as f:
            reader = csv.reader(f)
            next(reader, None)  # skip the header row; an empty file is not an error
            for row in reader:
                if not row:
                    continue
                name = row[0].strip()
                # Ignore blank / whitespace-only names so they do not inflate
                # the "already scraped" count.
                if name:
                    names.add(name)
    except FileNotFoundError:
        # First run: the "Already scraped: 0" line makes this visible.
        pass
    return names


def main(schools_path=SCHOOLS_CSV, scraped_path=SCRAPED_CSV,
         output_path=OUTPUT_CSV):
    """Write the schools that still need scraping to *output_path*.

    Args:
        schools_path: CSV of schools (header row, name in the first column,
            website in any column).
        scraped_path: CSV of scraping results (header row, school name in
            the first column). May be absent.
        output_path: Destination CSV with ``name`` and ``website`` columns.

    Returns:
        The list of ``{'name': ..., 'website': ...}`` dicts that were written.
    """
    schools = _load_schools(schools_path)
    scraped_schools = _load_scraped_names(scraped_path)

    # Keep only the schools whose name has no entry in the results file.
    unscraped = [s for s in schools if s['name'] not in scraped_schools]

    print(f"Total schools with websites: {len(schools)}")
    print(f"Already scraped: {len(scraped_schools)}")
    print(f"Unscraped: {len(unscraped)}")

    # Save unscraped schools for processing.
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['name', 'website'])
        writer.writeheader()
        writer.writerows(unscraped)

    print("\nFirst 20 unscraped schools:")
    for s in unscraped[:20]:
        print(f"{s['name']}: {s['website']}")

    return unscraped


if __name__ == "__main__":
    main()