Build Essay Data
Extracts structured evidence from DB and corpus for the Russell and Concordance essays.
1"""Extract and structure evidence for the two essay pages. 2 3Produces: 4- staging/essay_russell.json: evidence for the Russell Alchemical Hands essay 5- staging/essay_concordance.json: evidence for the Concordance Methodology essay 6 7All data is retrieved evidence + DB queries. No generated interpretation. 8""" 9 10import sqlite3 11import json 12from pathlib import Path 13 14BASE_DIR = Path(__file__).resolve().parent.parent 15DB_PATH = BASE_DIR / "db" / "hp.db" 16STAGING_DIR = BASE_DIR / "staging" 17 18import sys 19sys.path.insert(0, str(BASE_DIR / "scripts")) 20from corpus_search import search_chunks 21 22 23def build_russell_essay_data(): 24 """Gather evidence for the Russell Alchemical Hands essay.""" 25 conn = sqlite3.connect(DB_PATH) 26 conn.row_factory = sqlite3.Row 27 cur = conn.cursor() 28 29 data = { 30 'title': "Russell's Research on the Alchemical Hands", 31 'source_method': 'CORPUS_EXTRACTION', 32 'sections': [], 33 } 34 35 # 1. Get all annotator hands, especially alchemists 36 cur.execute(""" 37 SELECT hand_label, manuscript_shelfmark, attribution, is_alchemist, 38 school, language, ink_color, date_range, description, interests 39 FROM annotator_hands ORDER BY manuscript_shelfmark, hand_label 40 """) 41 all_hands = [dict(row) for row in cur.fetchall()] 42 alchemist_hands = [h for h in all_hands if h['is_alchemist']] 43 44 data['sections'].append({ 45 'id': 'annotator-hands-overview', 46 'title': 'All Annotator Hands in the Corpus', 47 'evidence_type': 'DB_QUERY', 48 'data': all_hands, 49 }) 50 51 data['sections'].append({ 52 'id': 'alchemist-hands', 53 'title': 'Alchemist Annotators', 54 'evidence_type': 'DB_QUERY', 55 'data': alchemist_hands, 56 }) 57 58 # 2. Get dissertation refs attributed to alchemist hands 59 cur.execute(""" 60 SELECT r.signature_ref, r.thesis_page, r.context_text, r.marginal_text, 61 r.chapter_num, r.manuscript_shelfmark, 62 h.hand_label, h.attribution, h.school 63 FROM dissertation_refs r 64 JOIN annotator_hands h ON r.hand_id = h.id 65 WHERE h.is_alchemist = 1 66 ORDER BY r.thesis_page 67 """) 68 alchemist_refs = [dict(row) for row in cur.fetchall()] 69 70 data['sections'].append({ 71 'id': 'alchemist-refs', 72 'title': 'Dissertation References to Alchemical Annotations', 73 'evidence_type': 'DB_QUERY', 74 'count': len(alchemist_refs), 75 'data': alchemist_refs[:50], # cap for file size 76 }) 77 78 # 3. Get images matched to alchemist refs 79 cur.execute(""" 80 SELECT r.signature_ref, i.filename, COALESCE(i.web_path, i.relative_path) as relative_path, 81 m.shelfmark, mat.confidence, 82 h.hand_label, h.school 83 FROM matches mat 84 JOIN dissertation_refs r ON mat.ref_id = r.id 85 JOIN images i ON mat.image_id = i.id 86 JOIN manuscripts m ON i.manuscript_id = m.id 87 JOIN annotator_hands h ON r.hand_id = h.id 88 WHERE h.is_alchemist = 1 89 ORDER BY mat.confidence DESC, r.signature_ref 90 """) 91 alchemist_images = [dict(row) for row in cur.fetchall()] 92 93 data['sections'].append({ 94 'id': 'alchemist-images', 95 'title': 'Images Matched to Alchemical Annotations', 96 'evidence_type': 'DB_QUERY', 97 'count': len(alchemist_images), 98 'data': alchemist_images[:30], 99 }) 100 101 # 4. Search corpus for key alchemical evidence 102 searches = [ 103 ('BL alchemist Hand B', 'Hand B alchemist BL mercury'), 104 ('Buffalo alchemist Hand E', 'Hand E Buffalo alchemist Geber'), 105 ("d'Espagnet framework", "Espagnet mercury Enchiridion"), 106 ('Master Mercury flyleaf', 'Mercury Magisteri flyleaf verus sensus'), 107 ('Sol Luna Buffalo', 'Sol Luna gold silver Buffalo chess'), 108 ('alchemical ideograms', 'ideogram alchemical symbol sign'), 109 ] 110 111 for label, query in searches: 112 results = search_chunks(query, top_n=5) 113 data['sections'].append({ 114 'id': f'corpus-{label.lower().replace(" ", "-")}', 115 'title': f'Corpus Evidence: {label}', 116 'evidence_type': 'CORPUS_SEARCH', 117 'query': query, 118 'results': [{ 119 'source_doc': r['source_doc'], 120 'section': r['section'], 121 'page_refs': r['page_refs'], 122 'matched_text': r['matched_text'], 123 'relevance_score': r['relevance_score'], 124 } for r in results], 125 }) 126 127 # 5. Match statistics 128 cur.execute(""" 129 SELECT mat.confidence, COUNT(*) 130 FROM matches mat 131 JOIN dissertation_refs r ON mat.ref_id = r.id 132 JOIN annotator_hands h ON r.hand_id = h.id 133 WHERE h.is_alchemist = 1 134 GROUP BY mat.confidence 135 """) 136 confidence_dist = {row[0]: row[1] for row in cur.fetchall()} 137 138 data['sections'].append({ 139 'id': 'confidence-distribution', 140 'title': 'Confidence Distribution for Alchemist Matches', 141 'evidence_type': 'DB_QUERY', 142 'data': confidence_dist, 143 }) 144 145 conn.close() 146 147 out_path = STAGING_DIR / "essay_russell.json" 148 with open(out_path, 'w', encoding='utf-8') as f: 149 json.dump(data, f, indent=2, ensure_ascii=False) 150 print(f"Russell essay data: {out_path}") 151 return data 152 153 154def build_concordance_essay_data(): 155 """Gather evidence for the Concordance Methodology essay.""" 156 conn = sqlite3.connect(DB_PATH) 157 conn.row_factory = sqlite3.Row 158 cur = conn.cursor() 159 160 data = { 161 'title': 'Concordance Methodology', 162 'source_method': 'DETERMINISTIC', 163 'sections': [], 164 } 165 166 # 1. Manuscript overview 167 cur.execute("SELECT * FROM manuscripts") 168 manuscripts = [dict(row) for row in cur.fetchall()] 169 data['sections'].append({ 170 'id': 'manuscripts', 171 'title': 'Manuscripts in the Corpus', 172 'evidence_type': 'DB_QUERY', 173 'data': manuscripts, 174 }) 175 176 # 2. Signature map stats 177 cur.execute("SELECT COUNT(*) FROM signature_map") 178 sig_count = cur.fetchone()[0] 179 cur.execute("SELECT MIN(signature), MAX(signature) FROM signature_map") 180 sig_range = cur.fetchone() 181 cur.execute("SELECT quire, COUNT(*) FROM signature_map GROUP BY quire ORDER BY quire") 182 quire_counts = {row[0]: row[1] for row in cur.fetchall()} 183 184 data['sections'].append({ 185 'id': 'signature-map', 186 'title': 'Signature Map Statistics', 187 'evidence_type': 'DB_QUERY', 188 'data': { 189 'total_signatures': sig_count, 190 'range': [sig_range[0], sig_range[1]], 191 'quires': quire_counts, 192 }, 193 }) 194 195 # 3. Dissertation refs stats 196 cur.execute("SELECT COUNT(*) FROM dissertation_refs") 197 ref_count = cur.fetchone()[0] 198 cur.execute(""" 199 SELECT ref_type, COUNT(*) FROM dissertation_refs 200 GROUP BY ref_type ORDER BY COUNT(*) DESC 201 """) 202 ref_types = {row[0]: row[1] for row in cur.fetchall()} 203 cur.execute(""" 204 SELECT manuscript_shelfmark, COUNT(*) FROM dissertation_refs 205 WHERE manuscript_shelfmark IS NOT NULL 206 GROUP BY manuscript_shelfmark ORDER BY COUNT(*) DESC 207 """) 208 ref_by_ms = {row[0]: row[1] for row in cur.fetchall()} 209 210 data['sections'].append({ 211 'id': 'dissertation-refs', 212 'title': 'Dissertation Reference Extraction', 213 'evidence_type': 'DB_QUERY', 214 'data': { 215 'total_refs': ref_count, 216 'by_type': ref_types, 217 'by_manuscript': ref_by_ms, 218 }, 219 }) 220 221 # 4. Image catalog stats 222 cur.execute("SELECT COUNT(*) FROM images") 223 img_count = cur.fetchone()[0] 224 cur.execute(""" 225 SELECT m.shelfmark, COUNT(*) FROM images i 226 JOIN manuscripts m ON i.manuscript_id = m.id 227 GROUP BY m.shelfmark 228 """) 229 img_by_ms = {row[0]: row[1] for row in cur.fetchall()} 230 cur.execute(""" 231 SELECT page_type, COUNT(*) FROM images 232 GROUP BY page_type ORDER BY COUNT(*) DESC 233 """) 234 img_by_type = {row[0]: row[1] for row in cur.fetchall()} 235 236 data['sections'].append({ 237 'id': 'image-catalog', 238 'title': 'Image Catalog Statistics', 239 'evidence_type': 'DB_QUERY', 240 'data': { 241 'total_images': img_count, 242 'by_manuscript': img_by_ms, 243 'by_type': img_by_type, 244 }, 245 }) 246 247 # 5. Match statistics 248 cur.execute("SELECT COUNT(*) FROM matches") 249 match_count = cur.fetchone()[0] 250 cur.execute(""" 251 SELECT confidence, COUNT(*) FROM matches 252 GROUP BY confidence ORDER BY COUNT(*) DESC 253 """) 254 match_conf = {row[0]: row[1] for row in cur.fetchall()} 255 cur.execute(""" 256 SELECT match_method, COUNT(*) FROM matches 257 GROUP BY match_method ORDER BY COUNT(*) DESC 258 """) 259 match_methods = {row[0]: row[1] for row in cur.fetchall()} 260 261 # BL vs Siena breakdown 262 cur.execute(""" 263 SELECT m.shelfmark, mat.confidence, COUNT(*) 264 FROM matches mat 265 JOIN images i ON mat.image_id = i.id 266 JOIN manuscripts m ON i.manuscript_id = m.id 267 GROUP BY m.shelfmark, mat.confidence 268 """) 269 ms_conf = {} 270 for row in cur.fetchall(): 271 ms_conf.setdefault(row[0], {})[row[1]] = row[2] 272 273 data['sections'].append({ 274 'id': 'matching-stats', 275 'title': 'Matching Pipeline Statistics', 276 'evidence_type': 'DB_QUERY', 277 'data': { 278 'total_matches': match_count, 279 'by_confidence': match_conf, 280 'by_method': match_methods, 281 'by_manuscript_and_confidence': ms_conf, 282 }, 283 }) 284 285 # 6. Hand attribution stats 286 cur.execute("SELECT COUNT(*) FROM annotator_hands") 287 hand_count = cur.fetchone()[0] 288 cur.execute(""" 289 SELECT hand_label, manuscript_shelfmark, attribution, is_alchemist 290 FROM annotator_hands ORDER BY manuscript_shelfmark 291 """) 292 hands = [dict(row) for row in cur.fetchall()] 293 294 data['sections'].append({ 295 'id': 'hand-attribution', 296 'title': 'Hand Attribution Data', 297 'evidence_type': 'DB_QUERY', 298 'data': { 299 'total_hands': hand_count, 300 'hands': hands, 301 }, 302 }) 303 304 conn.close() 305 306 out_path = STAGING_DIR / "essay_concordance.json" 307 with open(out_path, 'w', encoding='utf-8') as f: 308 json.dump(data, f, indent=2, ensure_ascii=False) 309 print(f"Concordance essay data: {out_path}") 310 return data 311 312 313if __name__ == "__main__": 314 print("=== Building Essay Data ===\n") 315 STAGING_DIR.mkdir(exist_ok=True) 316 build_russell_essay_data() 317 build_concordance_essay_data() 318 print("\nDone.")