Build Essay Data

build_essay_data.py — 318 lines
Extracts structured evidence from DB and corpus for the Russell and Concordance essays.
1"""Extract and structure evidence for the two essay pages.
2
3Produces:
4- staging/essay_russell.json: evidence for the Russell Alchemical Hands essay
5- staging/essay_concordance.json: evidence for the Concordance Methodology essay
6
7All data is retrieved evidence + DB queries. No generated interpretation.
8"""
9
10import sqlite3
11import json
12from pathlib import Path
13
14BASE_DIR = Path(__file__).resolve().parent.parent
15DB_PATH = BASE_DIR / "db" / "hp.db"
16STAGING_DIR = BASE_DIR / "staging"
17
18import sys
19sys.path.insert(0, str(BASE_DIR / "scripts"))
20from corpus_search import search_chunks
21
22
23def build_russell_essay_data():
24    """Gather evidence for the Russell Alchemical Hands essay."""
25    conn = sqlite3.connect(DB_PATH)
26    conn.row_factory = sqlite3.Row
27    cur = conn.cursor()
28
29    data = {
30        'title': "Russell's Research on the Alchemical Hands",
31        'source_method': 'CORPUS_EXTRACTION',
32        'sections': [],
33    }
34
35    # 1. Get all annotator hands, especially alchemists
36    cur.execute("""
37        SELECT hand_label, manuscript_shelfmark, attribution, is_alchemist,
38               school, language, ink_color, date_range, description, interests
39        FROM annotator_hands ORDER BY manuscript_shelfmark, hand_label
40    """)
41    all_hands = [dict(row) for row in cur.fetchall()]
42    alchemist_hands = [h for h in all_hands if h['is_alchemist']]
43
44    data['sections'].append({
45        'id': 'annotator-hands-overview',
46        'title': 'All Annotator Hands in the Corpus',
47        'evidence_type': 'DB_QUERY',
48        'data': all_hands,
49    })
50
51    data['sections'].append({
52        'id': 'alchemist-hands',
53        'title': 'Alchemist Annotators',
54        'evidence_type': 'DB_QUERY',
55        'data': alchemist_hands,
56    })
57
58    # 2. Get dissertation refs attributed to alchemist hands
59    cur.execute("""
60        SELECT r.signature_ref, r.thesis_page, r.context_text, r.marginal_text,
61               r.chapter_num, r.manuscript_shelfmark,
62               h.hand_label, h.attribution, h.school
63        FROM dissertation_refs r
64        JOIN annotator_hands h ON r.hand_id = h.id
65        WHERE h.is_alchemist = 1
66        ORDER BY r.thesis_page
67    """)
68    alchemist_refs = [dict(row) for row in cur.fetchall()]
69
70    data['sections'].append({
71        'id': 'alchemist-refs',
72        'title': 'Dissertation References to Alchemical Annotations',
73        'evidence_type': 'DB_QUERY',
74        'count': len(alchemist_refs),
75        'data': alchemist_refs[:50],  # cap for file size
76    })
77
78    # 3. Get images matched to alchemist refs
79    cur.execute("""
80        SELECT r.signature_ref, i.filename, COALESCE(i.web_path, i.relative_path) as relative_path,
81               m.shelfmark, mat.confidence,
82               h.hand_label, h.school
83        FROM matches mat
84        JOIN dissertation_refs r ON mat.ref_id = r.id
85        JOIN images i ON mat.image_id = i.id
86        JOIN manuscripts m ON i.manuscript_id = m.id
87        JOIN annotator_hands h ON r.hand_id = h.id
88        WHERE h.is_alchemist = 1
89        ORDER BY mat.confidence DESC, r.signature_ref
90    """)
91    alchemist_images = [dict(row) for row in cur.fetchall()]
92
93    data['sections'].append({
94        'id': 'alchemist-images',
95        'title': 'Images Matched to Alchemical Annotations',
96        'evidence_type': 'DB_QUERY',
97        'count': len(alchemist_images),
98        'data': alchemist_images[:30],
99    })
100
101    # 4. Search corpus for key alchemical evidence
102    searches = [
103        ('BL alchemist Hand B', 'Hand B alchemist BL mercury'),
104        ('Buffalo alchemist Hand E', 'Hand E Buffalo alchemist Geber'),
105        ("d'Espagnet framework", "Espagnet mercury Enchiridion"),
106        ('Master Mercury flyleaf', 'Mercury Magisteri flyleaf verus sensus'),
107        ('Sol Luna Buffalo', 'Sol Luna gold silver Buffalo chess'),
108        ('alchemical ideograms', 'ideogram alchemical symbol sign'),
109    ]
110
111    for label, query in searches:
112        results = search_chunks(query, top_n=5)
113        data['sections'].append({
114            'id': f'corpus-{label.lower().replace(" ", "-")}',
115            'title': f'Corpus Evidence: {label}',
116            'evidence_type': 'CORPUS_SEARCH',
117            'query': query,
118            'results': [{
119                'source_doc': r['source_doc'],
120                'section': r['section'],
121                'page_refs': r['page_refs'],
122                'matched_text': r['matched_text'],
123                'relevance_score': r['relevance_score'],
124            } for r in results],
125        })
126
127    # 5. Match statistics
128    cur.execute("""
129        SELECT mat.confidence, COUNT(*)
130        FROM matches mat
131        JOIN dissertation_refs r ON mat.ref_id = r.id
132        JOIN annotator_hands h ON r.hand_id = h.id
133        WHERE h.is_alchemist = 1
134        GROUP BY mat.confidence
135    """)
136    confidence_dist = {row[0]: row[1] for row in cur.fetchall()}
137
138    data['sections'].append({
139        'id': 'confidence-distribution',
140        'title': 'Confidence Distribution for Alchemist Matches',
141        'evidence_type': 'DB_QUERY',
142        'data': confidence_dist,
143    })
144
145    conn.close()
146
147    out_path = STAGING_DIR / "essay_russell.json"
148    with open(out_path, 'w', encoding='utf-8') as f:
149        json.dump(data, f, indent=2, ensure_ascii=False)
150    print(f"Russell essay data: {out_path}")
151    return data
152
153
154def build_concordance_essay_data():
155    """Gather evidence for the Concordance Methodology essay."""
156    conn = sqlite3.connect(DB_PATH)
157    conn.row_factory = sqlite3.Row
158    cur = conn.cursor()
159
160    data = {
161        'title': 'Concordance Methodology',
162        'source_method': 'DETERMINISTIC',
163        'sections': [],
164    }
165
166    # 1. Manuscript overview
167    cur.execute("SELECT * FROM manuscripts")
168    manuscripts = [dict(row) for row in cur.fetchall()]
169    data['sections'].append({
170        'id': 'manuscripts',
171        'title': 'Manuscripts in the Corpus',
172        'evidence_type': 'DB_QUERY',
173        'data': manuscripts,
174    })
175
176    # 2. Signature map stats
177    cur.execute("SELECT COUNT(*) FROM signature_map")
178    sig_count = cur.fetchone()[0]
179    cur.execute("SELECT MIN(signature), MAX(signature) FROM signature_map")
180    sig_range = cur.fetchone()
181    cur.execute("SELECT quire, COUNT(*) FROM signature_map GROUP BY quire ORDER BY quire")
182    quire_counts = {row[0]: row[1] for row in cur.fetchall()}
183
184    data['sections'].append({
185        'id': 'signature-map',
186        'title': 'Signature Map Statistics',
187        'evidence_type': 'DB_QUERY',
188        'data': {
189            'total_signatures': sig_count,
190            'range': [sig_range[0], sig_range[1]],
191            'quires': quire_counts,
192        },
193    })
194
195    # 3. Dissertation refs stats
196    cur.execute("SELECT COUNT(*) FROM dissertation_refs")
197    ref_count = cur.fetchone()[0]
198    cur.execute("""
199        SELECT ref_type, COUNT(*) FROM dissertation_refs
200        GROUP BY ref_type ORDER BY COUNT(*) DESC
201    """)
202    ref_types = {row[0]: row[1] for row in cur.fetchall()}
203    cur.execute("""
204        SELECT manuscript_shelfmark, COUNT(*) FROM dissertation_refs
205        WHERE manuscript_shelfmark IS NOT NULL
206        GROUP BY manuscript_shelfmark ORDER BY COUNT(*) DESC
207    """)
208    ref_by_ms = {row[0]: row[1] for row in cur.fetchall()}
209
210    data['sections'].append({
211        'id': 'dissertation-refs',
212        'title': 'Dissertation Reference Extraction',
213        'evidence_type': 'DB_QUERY',
214        'data': {
215            'total_refs': ref_count,
216            'by_type': ref_types,
217            'by_manuscript': ref_by_ms,
218        },
219    })
220
221    # 4. Image catalog stats
222    cur.execute("SELECT COUNT(*) FROM images")
223    img_count = cur.fetchone()[0]
224    cur.execute("""
225        SELECT m.shelfmark, COUNT(*) FROM images i
226        JOIN manuscripts m ON i.manuscript_id = m.id
227        GROUP BY m.shelfmark
228    """)
229    img_by_ms = {row[0]: row[1] for row in cur.fetchall()}
230    cur.execute("""
231        SELECT page_type, COUNT(*) FROM images
232        GROUP BY page_type ORDER BY COUNT(*) DESC
233    """)
234    img_by_type = {row[0]: row[1] for row in cur.fetchall()}
235
236    data['sections'].append({
237        'id': 'image-catalog',
238        'title': 'Image Catalog Statistics',
239        'evidence_type': 'DB_QUERY',
240        'data': {
241            'total_images': img_count,
242            'by_manuscript': img_by_ms,
243            'by_type': img_by_type,
244        },
245    })
246
247    # 5. Match statistics
248    cur.execute("SELECT COUNT(*) FROM matches")
249    match_count = cur.fetchone()[0]
250    cur.execute("""
251        SELECT confidence, COUNT(*) FROM matches
252        GROUP BY confidence ORDER BY COUNT(*) DESC
253    """)
254    match_conf = {row[0]: row[1] for row in cur.fetchall()}
255    cur.execute("""
256        SELECT match_method, COUNT(*) FROM matches
257        GROUP BY match_method ORDER BY COUNT(*) DESC
258    """)
259    match_methods = {row[0]: row[1] for row in cur.fetchall()}
260
261    # BL vs Siena breakdown
262    cur.execute("""
263        SELECT m.shelfmark, mat.confidence, COUNT(*)
264        FROM matches mat
265        JOIN images i ON mat.image_id = i.id
266        JOIN manuscripts m ON i.manuscript_id = m.id
267        GROUP BY m.shelfmark, mat.confidence
268    """)
269    ms_conf = {}
270    for row in cur.fetchall():
271        ms_conf.setdefault(row[0], {})[row[1]] = row[2]
272
273    data['sections'].append({
274        'id': 'matching-stats',
275        'title': 'Matching Pipeline Statistics',
276        'evidence_type': 'DB_QUERY',
277        'data': {
278            'total_matches': match_count,
279            'by_confidence': match_conf,
280            'by_method': match_methods,
281            'by_manuscript_and_confidence': ms_conf,
282        },
283    })
284
285    # 6. Hand attribution stats
286    cur.execute("SELECT COUNT(*) FROM annotator_hands")
287    hand_count = cur.fetchone()[0]
288    cur.execute("""
289        SELECT hand_label, manuscript_shelfmark, attribution, is_alchemist
290        FROM annotator_hands ORDER BY manuscript_shelfmark
291    """)
292    hands = [dict(row) for row in cur.fetchall()]
293
294    data['sections'].append({
295        'id': 'hand-attribution',
296        'title': 'Hand Attribution Data',
297        'evidence_type': 'DB_QUERY',
298        'data': {
299            'total_hands': hand_count,
300            'hands': hands,
301        },
302    })
303
304    conn.close()
305
306    out_path = STAGING_DIR / "essay_concordance.json"
307    with open(out_path, 'w', encoding='utf-8') as f:
308        json.dump(data, f, indent=2, ensure_ascii=False)
309    print(f"Concordance essay data: {out_path}")
310    return data
311
312
313if __name__ == "__main__":
314    print("=== Building Essay Data ===\n")
315    STAGING_DIR.mkdir(exist_ok=True)
316    build_russell_essay_data()
317    build_concordance_essay_data()
318    print("\nDone.")