"""Find why candidates=379 instead of 1267."""
import csv, sqlite3
from pathlib import Path

BASE = Path(__file__).parent
DB = BASE / "data/processed/aima_conversion_shadow.sqlite"
DATASET = "lost_shops_202605191501"


def collect_used(csv_dir):
    """Gather the lead ids and phones listed in every ``*.csv`` of *csv_dir*.

    Intended to replicate the pipeline's ``collect_used`` step: values are
    stringified and stripped, and blank values are ignored.

    Args:
        csv_dir: Directory (``pathlib.Path``) whose ``*.csv`` files are
            scanned in sorted order.

    Returns:
        A ``(used_leads, used_phones, phones_by_file)`` tuple.  The first two
        are sets of non-empty strings; ``phones_by_file`` maps each CSV file
        name to the set of phones read from that file, so the caller can
        attribute overlaps without parsing the files a second time.
    """
    used_leads, used_phones = set(), set()
    phones_by_file = {}
    for path in sorted(csv_dir.glob("*.csv")):
        file_phones = phones_by_file.setdefault(path.name, set())
        try:
            with path.open(encoding="utf-8-sig") as handle:
                for row in csv.DictReader(handle):
                    lead_id = str(row.get("lead_id", "") or "").strip()
                    phone = str(row.get("phone", "") or "").strip()
                    if lead_id:
                        used_leads.add(lead_id)
                    if phone:
                        used_phones.add(phone)
                        file_phones.add(phone)
        except Exception as exc:
            # Best effort: report the problem and keep whatever was read
            # before the failure.  Those rows are already in the "used" sets,
            # so they must also stay attributable to this file.
            print(f"Error reading {path.name}: {exc}")
    return used_leads, used_phones, phones_by_file


def load_contacts(db_path, dataset):
    """Return every ``(lead_id, phone)`` row imported for *dataset*.

    The connection is closed even when the query raises.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        return conn.execute(
            "SELECT lead_id, phone FROM aima_imported_contacts WHERE dataset=? ORDER BY row_index DESC",
            (dataset,)
        ).fetchall()
    finally:
        conn.close()


def classify_rows(rows, used_leads, used_phones):
    """Count why each contact row is excluded (or kept) as a candidate.

    Args:
        rows: Iterable of ``(lead_id, phone)`` pairs; ``phone`` may be None.
        used_leads: Set of lead-id strings that are already used.
        used_phones: Set of phone strings that are already used.

    Returns:
        ``(excl_lead, excl_phone, excl_both, kept)`` where the first three
        count rows excluded by lead id only, by phone only, and by both.
    """
    excl_lead = excl_phone = excl_both = kept = 0
    for lead_id, phone in rows:
        # NOTE(review): a NULL lead_id is compared as the string "None";
        # kept as-is on the assumption that the pipeline does the same.
        by_lead = str(lead_id) in used_leads
        by_phone = str(phone or "") in used_phones
        if by_lead and by_phone:
            excl_both += 1
        elif by_lead:
            excl_lead += 1
        elif by_phone:
            excl_phone += 1
        else:
            kept += 1
    return excl_lead, excl_phone, excl_both, kept


def main():
    """Print how many DATASET contacts are excluded, and by which CSV files."""
    used_leads, used_phones, phones_by_file = collect_used(BASE / "data/processed")
    print(f"used_leads={len(used_leads)} used_phones={len(used_phones)}")

    all_rows = load_contacts(DB, DATASET)

    # Analyze exclusion reasons
    excl_lead, excl_phone, excl_both, kept = classify_rows(
        all_rows, used_leads, used_phones
    )
    print(f"\nOf {len(all_rows)} total in DB:")
    print(f"  excluded by lead_id only: {excl_lead}")
    print(f"  excluded by phone only:   {excl_phone}")
    print(f"  excluded by both:         {excl_both}")
    print(f"  KEPT (candidates):        {kept}")

    # Which dataset phones are already used?  Derived from the rows fetched
    # above and stringified the same way as in classify_rows, so this count
    # cannot disagree with the exclusion counts.
    dataset_phones = {str(phone) for _, phone in all_rows if phone}
    overlap = dataset_phones & used_phones
    print(f"\nPhones overlap (lost_shops ∩ used_phones): {len(overlap)}")

    # Which CSVs contribute those overlap phones?
    print("\nCSV files contributing phone overlaps:")
    for name, file_phones in phones_by_file.items():
        hits = file_phones & overlap
        if hits:
            print(f"  {name}: {len(hits)} phones that overlap with lost_shops")


if __name__ == "__main__":
    main()
