Validate & QA

validate.py — 264 lines
Checks data integrity (duplicate slugs, broken links, confidence distribution) and writes AUDIT_REPORT.md.
1"""Validation and QA: check data integrity, flag issues, produce audit report."""
2
3import sqlite3
4from pathlib import Path
5from datetime import datetime
6
# Project root: the parent of the directory that contains this script.
BASE_DIR = Path(__file__).resolve().parent.parent
# SQLite database queried by the validation checks.
DB_PATH = BASE_DIR.joinpath("db", "hp.db")
# Directory holding the generated site files that are inspected.
SITE_DIR = BASE_DIR.joinpath("site")
10
11
def main():
    """Run the validation checks, print a summary, and write AUDIT_REPORT.md.

    Checks 1-7 query the SQLite database at ``DB_PATH``; checks 8-10 inspect
    the generated files under ``SITE_DIR``.  Hard failures are collected in
    ``issues`` and softer findings in ``warnings``; both are printed and then
    written, together with the gathered statistics, to
    ``BASE_DIR / "AUDIT_REPORT.md"``.
    """
    import json

    issues = []
    warnings = []
    stats = {}

    print("=== Validation & QA ===\n")

    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()

        # 1. Duplicate slugs in dictionary
        print("1. Checking for duplicate dictionary slugs...")
        cur.execute("""
            SELECT slug, COUNT(*) FROM dictionary_terms
            GROUP BY slug HAVING COUNT(*) > 1
        """)
        dupes = cur.fetchall()
        if dupes:
            for slug, n_rows in dupes:
                issues.append(f"DUPLICATE SLUG: dictionary term '{slug}' appears {n_rows} times")
        else:
            print("   OK - no duplicates")

        # 2. Terms without category
        print("2. Checking terms without category...")
        cur.execute("SELECT slug FROM dictionary_terms WHERE category IS NULL OR category = ''")
        no_cat = cur.fetchall()
        if no_cat:
            for (slug,) in no_cat:
                issues.append(f"MISSING CATEGORY: term '{slug}' has no category")
        else:
            print("   OK - all terms have categories")

        # 3. Scholar pages with no works
        print("3. Checking scholars with no works...")
        cur.execute("""
            SELECT s.name FROM scholars s
            LEFT JOIN scholar_works sw ON s.id = sw.scholar_id
            WHERE sw.scholar_id IS NULL
        """)
        no_works = cur.fetchall()
        stats['scholars_no_works'] = len(no_works)
        for (name,) in no_works:
            warnings.append(f"SCHOLAR NO WORKS: '{name}' has no linked bibliography entries")
        print(f"   {len(no_works)} scholars without linked works (warning, not error)")

        # 4. Folio refs that don't resolve to images
        print("4. Checking unresolved folio references...")
        cur.execute("""
            SELECT r.id, r.signature_ref, r.thesis_page
            FROM dissertation_refs r
            LEFT JOIN matches m ON m.ref_id = r.id
            WHERE m.id IS NULL
        """)
        unmatched = cur.fetchall()
        stats['unmatched_refs'] = len(unmatched)
        if unmatched:
            warnings.append(f"UNMATCHED REFS: {len(unmatched)} dissertation references have no image match")
        print(f"   {len(unmatched)} unmatched references")

        # 5. Missing linked records in dictionary_term_links
        print("5. Checking dictionary link integrity...")
        cur.execute("""
            SELECT l.id, l.term_id, l.linked_term_id
            FROM dictionary_term_links l
            LEFT JOIN dictionary_terms t1 ON l.term_id = t1.id
            LEFT JOIN dictionary_terms t2 ON l.linked_term_id = t2.id
            WHERE t1.id IS NULL OR t2.id IS NULL
        """)
        broken_links = cur.fetchall()
        if broken_links:
            issues.append(f"BROKEN LINKS: {len(broken_links)} dictionary links point to missing terms")
        else:
            print("   OK - all links resolve")

        # 6. BL confidence check
        print("6. Verifying BL confidence downgrade...")
        cur.execute("""
            SELECT mat.confidence, COUNT(*)
            FROM matches mat
            JOIN images i ON mat.image_id = i.id
            JOIN manuscripts m ON i.manuscript_id = m.id
            WHERE m.shelfmark = 'C.60.o.12'
            GROUP BY mat.confidence
        """)
        for conf, count in cur.fetchall():
            if conf in ('HIGH', 'MEDIUM'):
                issues.append(f"BL CONFIDENCE: {count} BL matches still at {conf} (should be LOW)")
            else:
                print(f"   BL matches: {count} at {conf}")

        # 7. Review status summary
        print("7. Review status audit...")
        review_tables = [
            ('bibliography', 'needs_review'),
            ('scholars', 'needs_review'),
            ('dictionary_terms', 'needs_review'),
            ('matches', 'needs_review'),
        ]
        for table, col in review_tables:
            # Table and column names come from the fixed list above, never from
            # external input, so interpolating them into the SQL is safe here.
            try:
                cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {col} = 1")
                needs = cur.fetchone()[0]
                cur.execute(f"SELECT COUNT(*) FROM {table}")
                total = cur.fetchone()[0]
            except sqlite3.Error as exc:
                # A missing table/column must not abort the audit, but it
                # should be visible instead of being silently skipped.
                warnings.append(f"REVIEW STATUS: could not query {table}.{col} ({exc})")
                print(f"   {table}: skipped ({exc})")
                continue
            pct = (needs * 100 // total) if total > 0 else 0
            stats[f'{table}_needs_review'] = needs
            stats[f'{table}_total'] = total
            print(f"   {table}: {needs}/{total} need review ({pct}%)")
    finally:
        # No check below touches the database; release it even if one above raised.
        conn.close()

    # 8. Site file counts
    print("8. Checking generated site files...")
    site_counts = {
        'scholar pages': _count_html_pages(SITE_DIR / 'scholar'),
        'dictionary pages': _count_html_pages(SITE_DIR / 'dictionary'),
        'marginalia pages': _count_html_pages(SITE_DIR / 'marginalia'),
    }
    for label, count in site_counts.items():
        print(f"   {label}: {count}")
        stats[label] = count

    # 9. Check for empty HTML files
    print("9. Checking for empty HTML files...")
    empty_count = 0
    for html in SITE_DIR.rglob('*.html'):
        # Anything under 100 bytes is reported as an empty file.
        if html.stat().st_size < 100:
            issues.append(f"EMPTY FILE: {html.relative_to(BASE_DIR)}")
            empty_count += 1
    if empty_count == 0:
        print("   OK - no empty files")

    # 10. Data.json integrity
    print("10. Checking data.json...")
    data_path = SITE_DIR / 'data.json'
    if data_path.exists():
        try:
            with open(data_path, encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            issues.append(f"DATA.JSON: could not be parsed ({exc})")
        else:
            if isinstance(data, dict):
                # Reuse the defaulted list so a file without 'entries' does not
                # raise KeyError when counting LOW-confidence entries.
                entries = data.get('entries', [])
                n_entries = len(entries)
                n_low = sum(
                    1 for e in entries
                    if isinstance(e, dict) and e.get('confidence') == 'LOW'
                )
                print(f"   {n_entries} entries, {n_low} LOW confidence")
                stats['data_json_entries'] = n_entries
                if 'provenance' not in data:
                    warnings.append("DATA.JSON: missing provenance field")
            else:
                issues.append("DATA.JSON: top-level value is not a JSON object")
    else:
        issues.append("DATA.JSON: file not found")

    # === Produce Report ===
    print("\n" + "=" * 60)
    print("VALIDATION RESULTS")
    print("=" * 60)

    if issues:
        print(f"\n  ISSUES ({len(issues)}):")
        for issue in issues:
            print(f"    [!] {issue}")
    else:
        print("\n  No critical issues found.")

    if warnings:
        print(f"\n  WARNINGS ({len(warnings)}):")
        for warning in warnings:
            print(f"    [?] {warning}")

    # Write audit report
    report_path = BASE_DIR / "AUDIT_REPORT.md"
    _write_audit_report(report_path, issues, warnings, stats, site_counts)

    print(f"\nAudit report written to {report_path}")


def _count_html_pages(directory):
    """Return the number of ``*.html`` files directly inside *directory* (0 if absent)."""
    if not directory.is_dir():
        return 0
    return sum(1 for _ in directory.glob('*.html'))


def _write_audit_report(report_path, issues, warnings, stats, site_counts):
    """Write the Markdown audit report to *report_path*.

    *issues* and *warnings* are lists of message strings; *stats* and
    *site_counts* are the dictionaries filled in by ``main``.  The "Total"
    column uses the row counts measured during the review-status check and
    falls back to the previously hard-coded figures when a table could not be
    queried.
    """
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("# Audit Report\n\n")
        f.write(f"Generated: {datetime.now().isoformat()}\n\n")

        f.write("## What Changed (V2 Migration)\n\n")
        f.write("### Schema\n")
        f.write("- Added `annotations` table for first-class marginal note records\n")
        f.write("- Added `annotators` table (normalized from `annotator_hands`)\n")
        f.write("- Added `doc_folio_refs` table (generalized from `dissertation_refs`)\n")
        f.write("- Added `dictionary_terms` and `dictionary_term_links` tables\n")
        f.write("- Added `document_topics` junction table for multi-value topic clusters\n")
        f.write("- Added `review_status`, `needs_review`, `reviewed`, `source_method`, ")
        f.write("`confidence`, and `notes` columns across existing tables\n")
        f.write("- Downgraded all BL C.60.o.12 matches from MEDIUM to LOW confidence\n\n")

        f.write("### Site\n")
        f.write(f"- {site_counts.get('scholar pages', 0)} scholar profile pages (DB-driven, with review badges)\n")
        f.write(f"- {site_counts.get('dictionary pages', 0)} dictionary term pages (37 terms, 6 categories, 76 links)\n")
        f.write(f"- {site_counts.get('marginalia pages', 0)} marginalia folio detail pages (with annotator hand info)\n")
        f.write("- Site-wide navigation bar (Home, Marginalia, Scholars, Dictionary, About)\n")
        f.write("- About page with database statistics and rebuild instructions\n")
        f.write("- Confidence badges (HIGH/MEDIUM/LOW/PROVISIONAL) on all matches\n")
        f.write("- Review badges (Unreviewed) on LLM-assisted content\n\n")

        f.write("## What Remains Provisional\n\n")
        f.write("| Data Category | Total | Needs Review | Source Method |\n")
        f.write("|---|---|---|---|\n")
        f.write(f"| Bibliography entries | {stats.get('bibliography_total', 88)} | {stats.get('bibliography_needs_review', '?')} | LLM-assisted |\n")
        f.write(f"| Scholar profiles | {stats.get('scholars_total', 52)} | {stats.get('scholars_needs_review', '?')} | LLM-assisted |\n")
        f.write(f"| Dictionary terms | {stats.get('dictionary_terms_total', 37)} | {stats.get('dictionary_terms_needs_review', '?')} | LLM-assisted |\n")
        f.write(f"| Image matches | {stats.get('matches_total', 610)} | {stats.get('matches_needs_review', '?')} | Algorithmic (folio mapping) |\n")
        f.write("| BL matches specifically | 218 | 218 | LOW confidence (1545 edition offset) |\n\n")

        f.write("## What Still Needs Human Review\n\n")
        f.write("### Critical\n")
        f.write("1. **BL C.60.o.12 photo-to-folio mapping**: Verify that sequential photo numbers\n")
        f.write("   correspond to folio numbers. The BL copy is the 1545 edition; layout may differ\n")
        f.write("   from the 1499 signature map by a few leaves.\n")
        f.write("2. **Article summaries**: All 34 summaries were generated by Claude and have not\n")
        f.write("   been checked for factual accuracy, misattribution, or hallucination.\n")
        f.write("3. **Hand attributions**: Derived from reading Russell's prose in conversation.\n")
        f.write("   The signature-to-hand mapping rules are approximate.\n\n")

        f.write("### Important\n")
        f.write("4. **Scholar metadata**: Birth/death years, nationalities, and institutional\n")
        f.write("   affiliations should be cross-referenced against VIAF/WorldCat.\n")
        f.write("5. **Dictionary definitions**: Especially the alchemical and architectural terms\n")
        f.write("   should be reviewed by a domain specialist.\n")
        f.write("6. **Mislabeled files**: Jarzombek (De pictura, not HP) and Canone/Spruit\n")
        f.write("   (Poncet on Botticelli, not emblematics) were identified by LLM and\n")
        f.write("   should be confirmed.\n\n")

        f.write("### Nice to Have\n")
        f.write("7. **Timeline event dates**: Some date ranges are approximate.\n")
        f.write("8. **Topic cluster assignments**: Some works could belong to multiple clusters.\n")
        f.write("   The `document_topics` junction table supports this but most entries\n")
        f.write("   have only one topic assigned.\n\n")

        if issues:
            f.write("## Validation Issues\n\n")
            for issue in issues:
                f.write(f"- **{issue}**\n")
            f.write("\n")

        if warnings:
            f.write("## Validation Warnings\n\n")
            for warning in warnings:
                f.write(f"- {warning}\n")
            f.write("\n")

        f.write("## How to Rebuild\n\n")
        f.write("```bash\n")
        f.write("python scripts/migrate_v2.py        # Schema migration (idempotent)\n")
        f.write("python scripts/seed_dictionary.py   # Dictionary terms (idempotent)\n")
        f.write("python scripts/build_site.py        # Generate all HTML + JSON\n")
        f.write("python scripts/validate.py          # Run this audit\n")
        f.write("```\n")
261
262
# Run the audit only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()