Alchemical Hands in the Hypnerotomachia Poliphili

Marginalia, Scholarship & Reception

← All Scripts

Dictionary Audit

dictionary_audit.py — 160 lines

Audits dictionary coverage: missing fields, duplicate slugs, orphaned links, weak terms.

1"""Dictionary coverage audit: identifies terms needing improvement.
2
3Checks for:
4- Missing fields (significance, source_documents, etc.)
5- Terms stuck at DRAFT status
6- Duplicate slugs
7- Orphaned links (links to nonexistent terms)
8- Terms with no related links
9- Coverage by category
10
11Outputs JSON report to staging/dictionary_audit.json and console summary.
12"""
13
14import sqlite3
15import json
16from pathlib import Path
17
18BASE_DIR = Path(__file__).resolve().parent.parent
19DB_PATH = BASE_DIR / "db" / "hp.db"
20STAGING_DIR = BASE_DIR / "staging"
21
22
23def audit():
24    conn = sqlite3.connect(DB_PATH)
25    conn.row_factory = sqlite3.Row
26    cur = conn.cursor()
27
28    report = {
29        'total_terms': 0,
30        'by_status': {},
31        'by_category': {},
32        'missing_fields': [],
33        'duplicate_slugs': [],
34        'orphaned_links': [],
35        'terms_without_links': [],
36        'weak_terms': [],
37        'summary': {},
38    }
39
40    # Total terms
41    cur.execute("SELECT COUNT(*) FROM dictionary_terms")
42    report['total_terms'] = cur.fetchone()[0]
43
44    # By status
45    cur.execute("SELECT review_status, COUNT(*) FROM dictionary_terms GROUP BY review_status")
46    report['by_status'] = {row[0]: row[1] for row in cur.fetchall()}
47
48    # By category
49    cur.execute("SELECT category, COUNT(*) FROM dictionary_terms GROUP BY category ORDER BY COUNT(*) DESC")
50    report['by_category'] = {row[0]: row[1] for row in cur.fetchall()}
51
52    # Check for missing fields on each term
53    cur.execute("""
54        SELECT slug, label, category, definition_short, definition_long,
55               source_basis, review_status, significance_to_hp,
56               significance_to_scholarship, source_documents, source_page_refs,
57               source_method, confidence
58        FROM dictionary_terms ORDER BY slug
59    """)
60
61    important_fields = [
62        'definition_long', 'source_basis', 'significance_to_hp',
63        'significance_to_scholarship', 'source_documents'
64    ]
65
66    for row in cur.fetchall():
67        missing = []
68        for field in important_fields:
69            val = row[field] if field in row.keys() else None
70            if not val or (isinstance(val, str) and val.strip() == ''):
71                missing.append(field)
72
73        if missing:
74            report['missing_fields'].append({
75                'slug': row['slug'],
76                'label': row['label'],
77                'category': row['category'],
78                'review_status': row['review_status'],
79                'missing': missing,
80            })
81
82        # "Weak" = DRAFT + missing 2+ important fields
83        if row['review_status'] == 'DRAFT' and len(missing) >= 2:
84            report['weak_terms'].append({
85                'slug': row['slug'],
86                'label': row['label'],
87                'missing_count': len(missing),
88                'missing': missing,
89            })
90
91    # Duplicate slugs
92    cur.execute("""
93        SELECT slug, COUNT(*) FROM dictionary_terms
94        GROUP BY slug HAVING COUNT(*) > 1
95    """)
96    report['duplicate_slugs'] = [row[0] for row in cur.fetchall()]
97
98    # Orphaned links
99    cur.execute("""
100        SELECT l.id, l.term_id, l.linked_term_id
101        FROM dictionary_term_links l
102        LEFT JOIN dictionary_terms t1 ON l.term_id = t1.id
103        LEFT JOIN dictionary_terms t2 ON l.linked_term_id = t2.id
104        WHERE t1.id IS NULL OR t2.id IS NULL
105    """)
106    report['orphaned_links'] = [dict(row) for row in cur.fetchall()]
107
108    # Terms without any links
109    cur.execute("""
110        SELECT t.slug, t.label FROM dictionary_terms t
111        LEFT JOIN dictionary_term_links l ON t.id = l.term_id
112        WHERE l.id IS NULL
113    """)
114    report['terms_without_links'] = [{'slug': row[0], 'label': row[1]} for row in cur.fetchall()]
115
116    # Summary
117    report['summary'] = {
118        'total_terms': report['total_terms'],
119        'draft_count': report['by_status'].get('DRAFT', 0),
120        'verified_count': report['by_status'].get('VERIFIED', 0),
121        'terms_with_missing_fields': len(report['missing_fields']),
122        'weak_terms_count': len(report['weak_terms']),
123        'duplicate_slugs_count': len(report['duplicate_slugs']),
124        'orphaned_links_count': len(report['orphaned_links']),
125        'terms_without_links_count': len(report['terms_without_links']),
126        'categories': len(report['by_category']),
127    }
128
129    conn.close()
130
131    # Write report
132    STAGING_DIR.mkdir(exist_ok=True)
133    report_path = STAGING_DIR / "dictionary_audit.json"
134    with open(report_path, 'w', encoding='utf-8') as f:
135        json.dump(report, f, indent=2, ensure_ascii=False)
136
137    # Console output
138    print("=== Dictionary Audit ===\n")
139    print(f"Total terms: {report['total_terms']}")
140    print(f"By status: {report['by_status']}")
141    print(f"Categories: {len(report['by_category'])}")
142    for cat, count in report['by_category'].items():
143        print(f"  {cat}: {count}")
144    print(f"\nTerms with missing important fields: {len(report['missing_fields'])}")
145    print(f"Weak terms (DRAFT + 2+ missing): {len(report['weak_terms'])}")
146    if report['weak_terms']:
147        for wt in report['weak_terms']:
148            print(f"  - {wt['slug']}: missing {', '.join(wt['missing'])}")
149    print(f"Duplicate slugs: {len(report['duplicate_slugs'])}")
150    print(f"Orphaned links: {len(report['orphaned_links'])}")
151    print(f"Terms without links: {len(report['terms_without_links'])}")
152    if report['terms_without_links']:
153        for t in report['terms_without_links']:
154            print(f"  - {t['slug']}")
155    print(f"\nReport written to: {report_path}")
156    return report
157
158
159if __name__ == "__main__":
160    audit()