Export Showcase Data (Legacy)
Original data.json exporter for the gallery. Superseded by build_site.py.
"""Export matched references as JSON for the static showcase page."""

import sqlite3
import json
from contextlib import closing
from pathlib import Path

BASE_DIR = Path(__file__).resolve().parent.parent
DB_PATH = BASE_DIR / "db" / "hp.db"
OUTPUT_PATH = BASE_DIR / "site" / "data.json"

# Selects only HIGH-confidence matches whose image is an ordinary page
# (page_type = 'PAGE'). The LEFT JOIN keeps matches whose signature has no
# row in signature_map; quire / leaf_in_quire are NULL for those.
# NOTE(review): the original comment described these as the "Ch6 BL + Ch9
# Siena" matches -- nothing in the query itself restricts by chapter.
_SHOWCASE_QUERY = """
    SELECT
        dr.id as ref_id,
        dr.thesis_page,
        dr.signature_ref,
        dr.manuscript_shelfmark,
        dr.context_text,
        dr.marginal_text,
        dr.chapter_num,
        i.filename as image_filename,
        i.folio_number,
        i.side,
        i.page_type,
        COALESCE(i.web_path, i.relative_path) as relative_path,
        m.confidence,
        m.match_method,
        ms.shelfmark,
        ms.institution,
        ms.city,
        sm.quire,
        sm.leaf_in_quire
    FROM matches m
    JOIN dissertation_refs dr ON m.ref_id = dr.id
    JOIN images i ON m.image_id = i.id
    JOIN manuscripts ms ON i.manuscript_id = ms.id
    LEFT JOIN signature_map sm ON sm.signature = dr.signature_ref
    WHERE m.confidence = 'HIGH'
      AND i.page_type = 'PAGE'
    ORDER BY sm.folio_number, i.side, dr.thesis_page
"""

_MANUSCRIPTS_QUERY = (
    "SELECT shelfmark, institution, city, description, image_count FROM manuscripts"
)

# (JSON key, query column) pairs, in the order the keys appear in each entry.
_ENTRY_FIELDS = (
    ("ref_id", "ref_id"),
    ("thesis_page", "thesis_page"),
    ("signature", "signature_ref"),
    ("manuscript", "shelfmark"),
    ("institution", "institution"),
    ("city", "city"),
    ("context", "context_text"),
    ("marginal_text", "marginal_text"),
    ("chapter", "chapter_num"),
    ("image_file", "image_filename"),
    ("image_path", "relative_path"),
    ("folio_number", "folio_number"),
    ("side", "side"),
    ("confidence", "confidence"),
    ("quire", "quire"),
    ("leaf_in_quire", "leaf_in_quire"),
)


def _fetch_entries(conn):
    """Return showcase entries, keeping the first row per (signature, image)."""
    entries = []
    seen = set()  # (signature_ref, image_filename) pairs already emitted
    for row in conn.execute(_SHOWCASE_QUERY):
        key = (row["signature_ref"], row["image_filename"])
        if key in seen:
            continue
        seen.add(key)
        entries.append({out_key: row[column] for out_key, column in _ENTRY_FIELDS})
    return entries


def _fetch_scalar(conn, sql):
    """Run a single-value query and return that value."""
    return conn.execute(sql).fetchone()[0]


def main(db_path=None, output_path=None):
    """Export the showcase data from the SQLite database to a JSON file.

    The JSON document has three keys: ``entries`` (deduplicated
    HIGH-confidence page matches), ``manuscripts`` (every row of the
    ``manuscripts`` table) and ``stats`` (reference / signature / entry
    counts). A short summary is printed to stdout afterwards.

    Args:
        db_path: SQLite database to read. Defaults to ``DB_PATH``.
        output_path: JSON file to write. Defaults to ``OUTPUT_PATH``. Missing
            parent directories are created.

    Raises:
        sqlite3.Error: if the database cannot be opened or queried.
        OSError: if the output file cannot be written.
    """
    db_path = DB_PATH if db_path is None else Path(db_path)
    output_path = OUTPUT_PATH if output_path is None else Path(output_path)

    # closing() guarantees the connection is released even when a query
    # fails; sqlite3's own context manager only manages transactions.
    with closing(sqlite3.connect(db_path)) as conn:
        conn.row_factory = sqlite3.Row
        entries = _fetch_entries(conn)
        manuscripts = [dict(row) for row in conn.execute(_MANUSCRIPTS_QUERY)]
        unique_sigs = _fetch_scalar(
            conn, "SELECT COUNT(DISTINCT signature_ref) FROM dissertation_refs"
        )
        total_refs = _fetch_scalar(conn, "SELECT COUNT(*) FROM dissertation_refs")

    data = {
        'entries': entries,
        'manuscripts': manuscripts,
        'stats': {
            'total_references': total_refs,
            'unique_signatures': unique_sigs,
            'high_confidence_matches': len(entries),
        },
    }

    # parents=True so a fresh checkout without the output tree still works.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"Exported {len(entries)} showcase entries to {output_path}")
    print(f" BL C.60.o.12: {sum(1 for e in entries if e['manuscript'] == 'C.60.o.12')}")
    print(f" Siena O.III.38: {sum(1 for e in entries if e['manuscript'] == 'O.III.38')}")

    # Show a few entries
    print("\nSample entries:")
    for e in entries[:5]:
        print(f" {e['signature']} [{e['manuscript']}] -> {e['image_file']}")
        if e['marginal_text']:
            print(f" Marginal: '{e['marginal_text'][:60]}'")


if __name__ == "__main__":
    main()