#!/usr/bin/env python3
import csv

# Get ALL schools with websites from the original CSV - no dedup.
# Each entry keeps the first URL-like column found in its row.
all_schools = []
with open('schools_waterloo_council.csv', 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip header; default avoids StopIteration on an empty file

    for row in reader:
        # Need at least a name column plus one more column that could hold a URL.
        if len(row) < 2:
            continue

        school_name = row[0].strip() if row[0] else ""

        # The website may appear in any column; take the first URL-like value.
        website = ""
        for col in row:
            col = col.strip()
            # startswith accepts a tuple — one call instead of an `or` chain
            if col.startswith(('http://', 'https://', 'www.')):
                website = col
                break

        # Keep only rows where both a name and a website were found.
        if school_name and website:
            all_schools.append({
                'name': school_name,
                'website': website
            })

# Build the set of already-scraped identifiers in a SINGLE pass over the
# results CSV (the original code opened and parsed the file twice).
# Both the website (column 5) and the school name (column 1) are added,
# so the later filter can match on either.
scraped = set()
with open('scraped_contacts_results.csv', 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip header; default avoids StopIteration on an empty file
    for row in reader:
        # School name from column 1.
        if row and row[0]:
            scraped.add(row[0].strip())
        # Website from column 5; strip stray quotes left by earlier exports.
        if len(row) >= 5 and row[4]:
            website = row[4].strip().strip('"')
            if website.startswith(('http', 'www')):
                scraped.add(website)

print(f"Total schools with websites: {len(all_schools)}")
print(f"Scraped entries: {len(scraped)}")

# Keep only schools whose website AND name are both unseen in the results —
# a comprehension replaces the manual append loop.
to_scrape = [
    s for s in all_schools
    if s['website'] not in scraped and s['name'] not in scraped
]

print(f"To scrape: {len(to_scrape)}")
print("\nFirst 30:")
for s in to_scrape[:30]:
    print(f"{s['name']}: {s['website']}")