Match Refs to Images
SQL join pipeline matching dissertation references to manuscript images via the signature map.
1"""Match dissertation references to manuscript images via signature mapping.""" 2 3import sqlite3 4from pathlib import Path 5 6BASE_DIR = Path(__file__).resolve().parent.parent 7DB_PATH = BASE_DIR / "db" / "hp.db" 8 9 10def main(): 11 conn = sqlite3.connect(DB_PATH) 12 cur = conn.cursor() 13 14 # Clear existing matches 15 cur.execute("DELETE FROM matches") 16 17 # Get all dissertation refs 18 cur.execute("""SELECT id, signature_ref, manuscript_shelfmark, chapter_num 19 FROM dissertation_refs""") 20 refs = cur.fetchall() 21 22 stats = {'total': len(refs), 'matched': 0, 'unmatched': 0, 23 'high': 0, 'medium': 0, 'low': 0} 24 25 for ref_id, sig_ref, ms_shelfmark, chapter_num in refs: 26 # Look up signature in the mapping table 27 # Try lowercase first (Russell typically uses lowercase) 28 cur.execute("""SELECT folio_number, side FROM signature_map 29 WHERE signature = ? OR signature = ?""", 30 (sig_ref, sig_ref.lower())) 31 sig_row = cur.fetchone() 32 33 if not sig_row: 34 stats['unmatched'] += 1 35 continue 36 37 folio_num, side = sig_row 38 39 # Find matching image(s) for this manuscript 40 if ms_shelfmark: 41 # Try exact manuscript match 42 cur.execute(""" 43 SELECT i.id FROM images i 44 JOIN manuscripts m ON i.manuscript_id = m.id 45 WHERE m.shelfmark = ? 46 AND i.page_type = 'PAGE' 47 AND (i.folio_number = ? OR i.folio_number = ?) 48 ORDER BY i.sort_order 49 """, (ms_shelfmark, str(folio_num), str(folio_num).zfill(4))) 50 image_rows = cur.fetchall() 51 52 if image_rows: 53 # If side is specified and Siena images have side info, prefer exact match 54 if side and ms_shelfmark == 'O.III.38': 55 cur.execute(""" 56 SELECT i.id FROM images i 57 JOIN manuscripts m ON i.manuscript_id = m.id 58 WHERE m.shelfmark = ? 59 AND i.page_type = 'PAGE' 60 AND (i.folio_number = ? OR i.folio_number = ?) 61 AND i.side = ? 62 """, (ms_shelfmark, str(folio_num), str(folio_num).zfill(4), side)) 63 exact_side = cur.fetchall() 64 if exact_side: 65 image_rows = exact_side 66 67 for (image_id,) in image_rows: 68 confidence = 'HIGH' 69 method = 'SIGNATURE_EXACT' 70 cur.execute(""" 71 INSERT INTO matches (ref_id, image_id, match_method, confidence, needs_review) 72 VALUES (?, ?, ?, ?, ?) 73 """, (ref_id, image_id, method, confidence, 0)) 74 stats['matched'] += 1 75 stats['high'] += 1 76 else: 77 # Try matching across all manuscripts as fallback 78 cur.execute(""" 79 SELECT i.id, m.shelfmark FROM images i 80 JOIN manuscripts m ON i.manuscript_id = m.id 81 WHERE i.page_type = 'PAGE' 82 AND (i.folio_number = ? OR i.folio_number = ?) 83 ORDER BY i.sort_order 84 """, (str(folio_num), str(folio_num).zfill(4))) 85 fallback_rows = cur.fetchall() 86 87 if fallback_rows: 88 for image_id, found_ms in fallback_rows: 89 cur.execute(""" 90 INSERT INTO matches (ref_id, image_id, match_method, confidence, needs_review) 91 VALUES (?, ?, ?, ?, ?) 92 """, (ref_id, image_id, 'FOLIO_EXACT', 'MEDIUM', 1)) 93 stats['matched'] += 1 94 stats['medium'] += 1 95 else: 96 stats['unmatched'] += 1 97 else: 98 stats['unmatched'] += 1 99 100 conn.commit() 101 102 # Print statistics 103 print("Match Statistics:") 104 print(f" Total references: {stats['total']}") 105 print(f" Matched: {stats['matched']}") 106 print(f" Unmatched: {stats['unmatched']}") 107 print(f" HIGH confidence: {stats['high']}") 108 print(f" MEDIUM confidence: {stats['medium']}") 109 print(f" LOW confidence: {stats['low']}") 110 111 # Show unmatched refs 112 cur.execute(""" 113 SELECT dr.id, dr.thesis_page, dr.signature_ref, dr.manuscript_shelfmark 114 FROM dissertation_refs dr 115 WHERE dr.id NOT IN (SELECT ref_id FROM matches) 116 LIMIT 20 117 """) 118 unmatched = cur.fetchall() 119 if unmatched: 120 print(f"\nUnmatched references (first 20):") 121 for ref_id, page, sig, ms in unmatched: 122 print(f" ref {ref_id}: p.{page} {sig} [{ms}]") 123 124 # Show sample matches 125 print("\nSample matches:") 126 cur.execute(""" 127 SELECT dr.thesis_page, dr.signature_ref, dr.manuscript_shelfmark, 128 i.filename, m.confidence 129 FROM matches m 130 JOIN dissertation_refs dr ON m.ref_id = dr.id 131 JOIN images i ON m.image_id = i.id 132 LIMIT 10 133 """) 134 for page, sig, ms, img_file, conf in cur.fetchall(): 135 print(f" p.{page} {sig} [{ms}] -> {img_file} ({conf})") 136 137 conn.close() 138 print("\nDone.") 139 140 141if __name__ == "__main__": 142 main()