Validate & QA
Checks data integrity (duplicate slugs, broken links, confidence distribution) and writes AUDIT_REPORT.md.
"""Validation and QA: check data integrity, flag issues, produce audit report."""

import json
import sqlite3
from datetime import datetime
from pathlib import Path

BASE_DIR = Path(__file__).resolve().parent.parent
DB_PATH = BASE_DIR / "db" / "hp.db"
SITE_DIR = BASE_DIR / "site"

# (table, flag column) pairs summarised by the review-status audit.
# These are fixed internal identifiers, never user input, which is why they
# may be interpolated into SQL text below.
_REVIEW_TABLES = (
    ('bibliography', 'needs_review'),
    ('scholars', 'needs_review'),
    ('dictionary_terms', 'needs_review'),
    ('matches', 'needs_review'),
)


def _check_database(cur, issues, warnings, stats):
    """Run checks 1-7 against the database behind cursor *cur*.

    Findings are appended to *issues* (errors) and *warnings*; counters are
    stored in *stats*. Checks 1-6 let ``sqlite3.Error`` propagate; check 7
    records an unreadable table as a warning and carries on.
    """
    # 1. Duplicate slugs in dictionary
    print("1. Checking for duplicate dictionary slugs...")
    cur.execute("""
        SELECT slug, COUNT(*) FROM dictionary_terms
        GROUP BY slug HAVING COUNT(*) > 1
    """)
    dupes = cur.fetchall()
    if dupes:
        for slug, count in dupes:
            issues.append(f"DUPLICATE SLUG: dictionary term '{slug}' appears {count} times")
    else:
        print(" OK - no duplicates")

    # 2. Terms without category
    print("2. Checking terms without category...")
    cur.execute("SELECT slug FROM dictionary_terms WHERE category IS NULL OR category = ''")
    no_cat = cur.fetchall()
    if no_cat:
        for (slug,) in no_cat:
            issues.append(f"MISSING CATEGORY: term '{slug}' has no category")
    else:
        print(" OK - all terms have categories")

    # 3. Scholar pages with no works
    print("3. Checking scholars with no works...")
    cur.execute("""
        SELECT s.name FROM scholars s
        LEFT JOIN scholar_works sw ON s.id = sw.scholar_id
        WHERE sw.scholar_id IS NULL
    """)
    no_works = cur.fetchall()
    stats['scholars_no_works'] = len(no_works)
    if no_works:
        for (name,) in no_works:
            warnings.append(f"SCHOLAR NO WORKS: '{name}' has no linked bibliography entries")
        print(f" {len(no_works)} scholars without linked works (warning, not error)")

    # 4. Folio refs that don't resolve to images
    print("4. Checking unresolved folio references...")
    cur.execute("""
        SELECT r.id, r.signature_ref, r.thesis_page
        FROM dissertation_refs r
        LEFT JOIN matches m ON m.ref_id = r.id
        WHERE m.id IS NULL
    """)
    unmatched = cur.fetchall()
    stats['unmatched_refs'] = len(unmatched)
    if unmatched:
        warnings.append(f"UNMATCHED REFS: {len(unmatched)} dissertation references have no image match")
        print(f" {len(unmatched)} unmatched references")

    # 5. Missing linked records in dictionary_term_links
    print("5. Checking dictionary link integrity...")
    cur.execute("""
        SELECT l.id, l.term_id, l.linked_term_id
        FROM dictionary_term_links l
        LEFT JOIN dictionary_terms t1 ON l.term_id = t1.id
        LEFT JOIN dictionary_terms t2 ON l.linked_term_id = t2.id
        WHERE t1.id IS NULL OR t2.id IS NULL
    """)
    broken_links = cur.fetchall()
    if broken_links:
        issues.append(f"BROKEN LINKS: {len(broken_links)} dictionary links point to missing terms")
    else:
        print(" OK - all links resolve")

    # 6. BL confidence check
    print("6. Verifying BL confidence downgrade...")
    cur.execute("""
        SELECT mat.confidence, COUNT(*)
        FROM matches mat
        JOIN images i ON mat.image_id = i.id
        JOIN manuscripts m ON i.manuscript_id = m.id
        WHERE m.shelfmark = 'C.60.o.12'
        GROUP BY mat.confidence
    """)
    for conf, count in cur.fetchall():
        if conf in ('HIGH', 'MEDIUM'):
            issues.append(f"BL CONFIDENCE: {count} BL matches still at {conf} (should be LOW)")
        else:
            print(f" BL matches: {count} at {conf}")

    # 7. Review status summary
    print("7. Review status audit...")
    for table, col in _REVIEW_TABLES:
        try:
            cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {col} = 1")
            needs = cur.fetchone()[0]
            cur.execute(f"SELECT COUNT(*) FROM {table}")
            total = cur.fetchone()[0]
        except sqlite3.Error as exc:
            # Best effort: a missing table/column must not abort the audit,
            # but it must be visible in the report instead of vanishing.
            warnings.append(f"REVIEW AUDIT: could not read {table}.{col} ({exc})")
            print(f" {table}: skipped ({exc})")
            continue
        pct = (needs * 100 // total) if total > 0 else 0
        stats[f'{table}_needs_review'] = needs
        stats[f'{table}_total'] = total
        print(f" {table}: {needs}/{total} need review ({pct}%)")


def _count_html(directory):
    """Return the number of ``*.html`` files directly inside *directory* (0 if absent)."""
    if not directory.exists():
        return 0
    return sum(1 for _ in directory.glob('*.html'))


def _check_data_json(site_dir, issues, warnings, stats):
    """Run check 10: make sure ``data.json`` exists, parses and has the expected keys."""
    print("10. Checking data.json...")
    data_path = site_dir / 'data.json'
    if not data_path.exists():
        issues.append("DATA.JSON: file not found")
        return

    try:
        with open(data_path, encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        issues.append(f"DATA.JSON: could not be read ({exc})")
        return
    if not isinstance(data, dict):
        issues.append("DATA.JSON: top-level value is not an object")
        return

    # Read 'entries' once with a default so a file lacking the key is
    # reported as 0 entries rather than raising KeyError.
    entries = data.get('entries', [])
    n_entries = len(entries)
    n_low = sum(1 for e in entries if isinstance(e, dict) and e.get('confidence') == 'LOW')
    print(f" {n_entries} entries, {n_low} LOW confidence")
    stats['data_json_entries'] = n_entries
    if 'provenance' not in data:
        warnings.append("DATA.JSON: missing provenance field")


def _check_site(site_dir, issues, warnings, stats):
    """Run checks 8-10 against the generated site and return the page counts.

    Returns:
        dict mapping 'scholar pages' / 'dictionary pages' / 'marginalia pages'
        to the number of HTML files found in the corresponding sub-directory.
    """
    # 8. Site file counts
    print("8. Checking generated site files...")
    site_counts = {
        f'{section} pages': _count_html(site_dir / section)
        for section in ('scholar', 'dictionary', 'marginalia')
    }
    for label, count in site_counts.items():
        print(f" {label}: {count}")
        stats[label] = count

    # 9. Check for empty HTML files
    print("9. Checking for empty HTML files...")
    # Paths are reported relative to the directory that contains the site
    # (BASE_DIR for the default layout), e.g. "site/scholar/x.html".
    report_root = site_dir.parent
    empty_count = 0
    for html in sorted(site_dir.rglob('*.html')):
        # Anything under 100 bytes cannot be a real page; treat it as empty.
        if html.stat().st_size < 100:
            issues.append(f"EMPTY FILE: {html.relative_to(report_root)}")
            empty_count += 1
    if empty_count == 0:
        print(" OK - no empty files")

    # 10. Data.json integrity
    _check_data_json(site_dir, issues, warnings, stats)
    return site_counts


def _print_results(issues, warnings):
    """Print the console summary of collected issues and warnings."""
    print("\n" + "=" * 60)
    print("VALIDATION RESULTS")
    print("=" * 60)

    if issues:
        print(f"\n ISSUES ({len(issues)}):")
        for issue in issues:
            print(f" [!] {issue}")
    else:
        print("\n No critical issues found.")

    if warnings:
        print(f"\n WARNINGS ({len(warnings)}):")
        for warning in warnings:
            print(f" [?] {warning}")


def _write_report(report_path, site_counts, stats, issues, warnings):
    """Write the Markdown audit report to *report_path* (overwriting it)."""
    parts = [
        "# Audit Report\n\n",
        f"Generated: {datetime.now().isoformat()}\n\n",

        "## What Changed (V2 Migration)\n\n",
        "### Schema\n",
        "- Added `annotations` table for first-class marginal note records\n",
        "- Added `annotators` table (normalized from `annotator_hands`)\n",
        "- Added `doc_folio_refs` table (generalized from `dissertation_refs`)\n",
        "- Added `dictionary_terms` and `dictionary_term_links` tables\n",
        "- Added `document_topics` junction table for multi-value topic clusters\n",
        "- Added `review_status`, `needs_review`, `reviewed`, `source_method`, ",
        "`confidence`, and `notes` columns across existing tables\n",
        "- Downgraded all BL C.60.o.12 matches from MEDIUM to LOW confidence\n\n",

        "### Site\n",
        f"- {site_counts.get('scholar pages', 0)} scholar profile pages (DB-driven, with review badges)\n",
        f"- {site_counts.get('dictionary pages', 0)} dictionary term pages (37 terms, 6 categories, 76 links)\n",
        f"- {site_counts.get('marginalia pages', 0)} marginalia folio detail pages (with annotator hand info)\n",
        "- Site-wide navigation bar (Home, Marginalia, Scholars, Dictionary, About)\n",
        "- About page with database statistics and rebuild instructions\n",
        "- Confidence badges (HIGH/MEDIUM/LOW/PROVISIONAL) on all matches\n",
        "- Review badges (Unreviewed) on LLM-assisted content\n\n",

        "## What Remains Provisional\n\n",
        "| Data Category | Total | Needs Review | Source Method |\n",
        "|---|---|---|---|\n",
        # Totals come from the live review audit; the literal fallbacks are the
        # figures recorded at migration time, used only if a table was unreadable.
        f"| Bibliography entries | {stats.get('bibliography_total', 88)} | {stats.get('bibliography_needs_review', '?')} | LLM-assisted |\n",
        f"| Scholar profiles | {stats.get('scholars_total', 52)} | {stats.get('scholars_needs_review', '?')} | LLM-assisted |\n",
        f"| Dictionary terms | {stats.get('dictionary_terms_total', 37)} | {stats.get('dictionary_terms_needs_review', '?')} | LLM-assisted |\n",
        f"| Image matches | {stats.get('matches_total', 610)} | {stats.get('matches_needs_review', '?')} | Algorithmic (folio mapping) |\n",
        "| BL matches specifically | 218 | 218 | LOW confidence (1545 edition offset) |\n\n",

        "## What Still Needs Human Review\n\n",
        "### Critical\n",
        "1. **BL C.60.o.12 photo-to-folio mapping**: Verify that sequential photo numbers\n",
        " correspond to folio numbers. The BL copy is the 1545 edition; layout may differ\n",
        " from the 1499 signature map by a few leaves.\n",
        "2. **Article summaries**: All 34 summaries were generated by Claude and have not\n",
        " been checked for factual accuracy, misattribution, or hallucination.\n",
        "3. **Hand attributions**: Derived from reading Russell's prose in conversation.\n",
        " The signature-to-hand mapping rules are approximate.\n\n",

        "### Important\n",
        "4. **Scholar metadata**: Birth/death years, nationalities, and institutional\n",
        " affiliations should be cross-referenced against VIAF/WorldCat.\n",
        "5. **Dictionary definitions**: Especially the alchemical and architectural terms\n",
        " should be reviewed by a domain specialist.\n",
        "6. **Mislabeled files**: Jarzombek (De pictura, not HP) and Canone/Spruit\n",
        " (Poncet on Botticelli, not emblematics) were identified by LLM and\n",
        " should be confirmed.\n\n",

        "### Nice to Have\n",
        "7. **Timeline event dates**: Some date ranges are approximate.\n",
        "8. **Topic cluster assignments**: Some works could belong to multiple clusters.\n",
        " The `document_topics` junction table supports this but most entries\n",
        " have only one topic assigned.\n\n",
    ]

    if issues:
        parts.append("## Validation Issues\n\n")
        parts.extend(f"- **{issue}**\n" for issue in issues)
        parts.append("\n")

    if warnings:
        parts.append("## Validation Warnings\n\n")
        parts.extend(f"- {warning}\n" for warning in warnings)
        parts.append("\n")

    parts.extend([
        "## How to Rebuild\n\n",
        "```bash\n",
        "python scripts/migrate_v2.py # Schema migration (idempotent)\n",
        "python scripts/seed_dictionary.py # Dictionary terms (idempotent)\n",
        "python scripts/build_site.py # Generate all HTML + JSON\n",
        "python scripts/validate.py # Run this audit\n",
        "```\n",
    ])

    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("".join(parts))


def main(db_path=None, site_dir=None, report_path=None):
    """Run every validation check, print a summary and write the audit report.

    Args:
        db_path: SQLite database to audit. Defaults to ``DB_PATH``.
        site_dir: Root of the generated site. Defaults to ``SITE_DIR``.
        report_path: Where to write the Markdown report. Defaults to
            ``BASE_DIR / "AUDIT_REPORT.md"``.

    Raises:
        sqlite3.Error: if the database cannot be opened or one of the
            integrity queries (checks 1-6) fails.
    """
    db_path = DB_PATH if db_path is None else Path(db_path)
    site_dir = SITE_DIR if site_dir is None else Path(site_dir)
    if report_path is None:
        report_path = BASE_DIR / "AUDIT_REPORT.md"
    else:
        report_path = Path(report_path)

    issues = []
    warnings = []
    stats = {}

    print("=== Validation & QA ===\n")

    conn = sqlite3.connect(db_path)
    try:
        _check_database(conn.cursor(), issues, warnings, stats)
    finally:
        # Release the database even when a query raises.
        conn.close()

    site_counts = _check_site(site_dir, issues, warnings, stats)

    # === Produce Report ===
    _print_results(issues, warnings)
    _write_report(report_path, site_counts, stats, issues, warnings)

    print(f"\nAudit report written to {report_path}")


if __name__ == "__main__":
    main()