Dictionary Audit
Audits dictionary coverage: missing fields, duplicate slugs, orphaned links, weak terms.
1"""Dictionary coverage audit: identifies terms needing improvement. 2 3Checks for: 4- Missing fields (significance, source_documents, etc.) 5- Terms stuck at DRAFT status 6- Duplicate slugs 7- Orphaned links (links to nonexistent terms) 8- Terms with no related links 9- Coverage by category 10 11Outputs JSON report to staging/dictionary_audit.json and console summary. 12""" 13 14import sqlite3 15import json 16from pathlib import Path 17 18BASE_DIR = Path(__file__).resolve().parent.parent 19DB_PATH = BASE_DIR / "db" / "hp.db" 20STAGING_DIR = BASE_DIR / "staging" 21 22 23def audit(): 24 conn = sqlite3.connect(DB_PATH) 25 conn.row_factory = sqlite3.Row 26 cur = conn.cursor() 27 28 report = { 29 'total_terms': 0, 30 'by_status': {}, 31 'by_category': {}, 32 'missing_fields': [], 33 'duplicate_slugs': [], 34 'orphaned_links': [], 35 'terms_without_links': [], 36 'weak_terms': [], 37 'summary': {}, 38 } 39 40 # Total terms 41 cur.execute("SELECT COUNT(*) FROM dictionary_terms") 42 report['total_terms'] = cur.fetchone()[0] 43 44 # By status 45 cur.execute("SELECT review_status, COUNT(*) FROM dictionary_terms GROUP BY review_status") 46 report['by_status'] = {row[0]: row[1] for row in cur.fetchall()} 47 48 # By category 49 cur.execute("SELECT category, COUNT(*) FROM dictionary_terms GROUP BY category ORDER BY COUNT(*) DESC") 50 report['by_category'] = {row[0]: row[1] for row in cur.fetchall()} 51 52 # Check for missing fields on each term 53 cur.execute(""" 54 SELECT slug, label, category, definition_short, definition_long, 55 source_basis, review_status, significance_to_hp, 56 significance_to_scholarship, source_documents, source_page_refs, 57 source_method, confidence 58 FROM dictionary_terms ORDER BY slug 59 """) 60 61 important_fields = [ 62 'definition_long', 'source_basis', 'significance_to_hp', 63 'significance_to_scholarship', 'source_documents' 64 ] 65 66 for row in cur.fetchall(): 67 missing = [] 68 for field in important_fields: 69 val = row[field] if field in row.keys() else None 70 if not val or (isinstance(val, str) and val.strip() == ''): 71 missing.append(field) 72 73 if missing: 74 report['missing_fields'].append({ 75 'slug': row['slug'], 76 'label': row['label'], 77 'category': row['category'], 78 'review_status': row['review_status'], 79 'missing': missing, 80 }) 81 82 # "Weak" = DRAFT + missing 2+ important fields 83 if row['review_status'] == 'DRAFT' and len(missing) >= 2: 84 report['weak_terms'].append({ 85 'slug': row['slug'], 86 'label': row['label'], 87 'missing_count': len(missing), 88 'missing': missing, 89 }) 90 91 # Duplicate slugs 92 cur.execute(""" 93 SELECT slug, COUNT(*) FROM dictionary_terms 94 GROUP BY slug HAVING COUNT(*) > 1 95 """) 96 report['duplicate_slugs'] = [row[0] for row in cur.fetchall()] 97 98 # Orphaned links 99 cur.execute(""" 100 SELECT l.id, l.term_id, l.linked_term_id 101 FROM dictionary_term_links l 102 LEFT JOIN dictionary_terms t1 ON l.term_id = t1.id 103 LEFT JOIN dictionary_terms t2 ON l.linked_term_id = t2.id 104 WHERE t1.id IS NULL OR t2.id IS NULL 105 """) 106 report['orphaned_links'] = [dict(row) for row in cur.fetchall()] 107 108 # Terms without any links 109 cur.execute(""" 110 SELECT t.slug, t.label FROM dictionary_terms t 111 LEFT JOIN dictionary_term_links l ON t.id = l.term_id 112 WHERE l.id IS NULL 113 """) 114 report['terms_without_links'] = [{'slug': row[0], 'label': row[1]} for row in cur.fetchall()] 115 116 # Summary 117 report['summary'] = { 118 'total_terms': report['total_terms'], 119 'draft_count': report['by_status'].get('DRAFT', 0), 120 'verified_count': report['by_status'].get('VERIFIED', 0), 121 'terms_with_missing_fields': len(report['missing_fields']), 122 'weak_terms_count': len(report['weak_terms']), 123 'duplicate_slugs_count': len(report['duplicate_slugs']), 124 'orphaned_links_count': len(report['orphaned_links']), 125 'terms_without_links_count': len(report['terms_without_links']), 126 'categories': len(report['by_category']), 127 } 128 129 conn.close() 130 131 # Write report 132 STAGING_DIR.mkdir(exist_ok=True) 133 report_path = STAGING_DIR / "dictionary_audit.json" 134 with open(report_path, 'w', encoding='utf-8') as f: 135 json.dump(report, f, indent=2, ensure_ascii=False) 136 137 # Console output 138 print("=== Dictionary Audit ===\n") 139 print(f"Total terms: {report['total_terms']}") 140 print(f"By status: {report['by_status']}") 141 print(f"Categories: {len(report['by_category'])}") 142 for cat, count in report['by_category'].items(): 143 print(f" {cat}: {count}") 144 print(f"\nTerms with missing important fields: {len(report['missing_fields'])}") 145 print(f"Weak terms (DRAFT + 2+ missing): {len(report['weak_terms'])}") 146 if report['weak_terms']: 147 for wt in report['weak_terms']: 148 print(f" - {wt['slug']}: missing {', '.join(wt['missing'])}") 149 print(f"Duplicate slugs: {len(report['duplicate_slugs'])}") 150 print(f"Orphaned links: {len(report['orphaned_links'])}") 151 print(f"Terms without links: {len(report['terms_without_links'])}") 152 if report['terms_without_links']: 153 for t in report['terms_without_links']: 154 print(f" - {t['slug']}") 155 print(f"\nReport written to: {report_path}") 156 return report 157 158 159if __name__ == "__main__": 160 audit()