Alchemical Hands in the Hypnerotomachia Poliphili

Marginalia, Scholarship & Reception

← All Scripts

Match Refs to Images

match_refs_to_images.py — 142 lines

SQL join pipeline matching dissertation references to manuscript images via the signature map.

1"""Match dissertation references to manuscript images via signature mapping."""
2
3import sqlite3
4from pathlib import Path
5
6BASE_DIR = Path(__file__).resolve().parent.parent
7DB_PATH = BASE_DIR / "db" / "hp.db"
8
9
10def main():
11    conn = sqlite3.connect(DB_PATH)
12    cur = conn.cursor()
13
14    # Clear existing matches
15    cur.execute("DELETE FROM matches")
16
17    # Get all dissertation refs
18    cur.execute("""SELECT id, signature_ref, manuscript_shelfmark, chapter_num
19                   FROM dissertation_refs""")
20    refs = cur.fetchall()
21
22    stats = {'total': len(refs), 'matched': 0, 'unmatched': 0,
23             'high': 0, 'medium': 0, 'low': 0}
24
25    for ref_id, sig_ref, ms_shelfmark, chapter_num in refs:
26        # Look up signature in the mapping table
27        # Try lowercase first (Russell typically uses lowercase)
28        cur.execute("""SELECT folio_number, side FROM signature_map
29                       WHERE signature = ? OR signature = ?""",
30                    (sig_ref, sig_ref.lower()))
31        sig_row = cur.fetchone()
32
33        if not sig_row:
34            stats['unmatched'] += 1
35            continue
36
37        folio_num, side = sig_row
38
39        # Find matching image(s) for this manuscript
40        if ms_shelfmark:
41            # Try exact manuscript match
42            cur.execute("""
43                SELECT i.id FROM images i
44                JOIN manuscripts m ON i.manuscript_id = m.id
45                WHERE m.shelfmark = ?
46                AND i.page_type = 'PAGE'
47                AND (i.folio_number = ? OR i.folio_number = ?)
48                ORDER BY i.sort_order
49            """, (ms_shelfmark, str(folio_num), str(folio_num).zfill(4)))
50            image_rows = cur.fetchall()
51
52            if image_rows:
53                # If side is specified and Siena images have side info, prefer exact match
54                if side and ms_shelfmark == 'O.III.38':
55                    cur.execute("""
56                        SELECT i.id FROM images i
57                        JOIN manuscripts m ON i.manuscript_id = m.id
58                        WHERE m.shelfmark = ?
59                        AND i.page_type = 'PAGE'
60                        AND (i.folio_number = ? OR i.folio_number = ?)
61                        AND i.side = ?
62                    """, (ms_shelfmark, str(folio_num), str(folio_num).zfill(4), side))
63                    exact_side = cur.fetchall()
64                    if exact_side:
65                        image_rows = exact_side
66
67                for (image_id,) in image_rows:
68                    confidence = 'HIGH'
69                    method = 'SIGNATURE_EXACT'
70                    cur.execute("""
71                        INSERT INTO matches (ref_id, image_id, match_method, confidence, needs_review)
72                        VALUES (?, ?, ?, ?, ?)
73                    """, (ref_id, image_id, method, confidence, 0))
74                    stats['matched'] += 1
75                    stats['high'] += 1
76            else:
77                # Try matching across all manuscripts as fallback
78                cur.execute("""
79                    SELECT i.id, m.shelfmark FROM images i
80                    JOIN manuscripts m ON i.manuscript_id = m.id
81                    WHERE i.page_type = 'PAGE'
82                    AND (i.folio_number = ? OR i.folio_number = ?)
83                    ORDER BY i.sort_order
84                """, (str(folio_num), str(folio_num).zfill(4)))
85                fallback_rows = cur.fetchall()
86
87                if fallback_rows:
88                    for image_id, found_ms in fallback_rows:
89                        cur.execute("""
90                            INSERT INTO matches (ref_id, image_id, match_method, confidence, needs_review)
91                            VALUES (?, ?, ?, ?, ?)
92                        """, (ref_id, image_id, 'FOLIO_EXACT', 'MEDIUM', 1))
93                        stats['matched'] += 1
94                        stats['medium'] += 1
95                else:
96                    stats['unmatched'] += 1
97        else:
98            stats['unmatched'] += 1
99
100    conn.commit()
101
102    # Print statistics
103    print("Match Statistics:")
104    print(f"  Total references: {stats['total']}")
105    print(f"  Matched: {stats['matched']}")
106    print(f"  Unmatched: {stats['unmatched']}")
107    print(f"  HIGH confidence: {stats['high']}")
108    print(f"  MEDIUM confidence: {stats['medium']}")
109    print(f"  LOW confidence: {stats['low']}")
110
111    # Show unmatched refs
112    cur.execute("""
113        SELECT dr.id, dr.thesis_page, dr.signature_ref, dr.manuscript_shelfmark
114        FROM dissertation_refs dr
115        WHERE dr.id NOT IN (SELECT ref_id FROM matches)
116        LIMIT 20
117    """)
118    unmatched = cur.fetchall()
119    if unmatched:
120        print(f"\nUnmatched references (first 20):")
121        for ref_id, page, sig, ms in unmatched:
122            print(f"  ref {ref_id}: p.{page} {sig} [{ms}]")
123
124    # Show sample matches
125    print("\nSample matches:")
126    cur.execute("""
127        SELECT dr.thesis_page, dr.signature_ref, dr.manuscript_shelfmark,
128               i.filename, m.confidence
129        FROM matches m
130        JOIN dissertation_refs dr ON m.ref_id = dr.id
131        JOIN images i ON m.image_id = i.id
132        LIMIT 10
133    """)
134    for page, sig, ms, img_file, conf in cur.fetchall():
135        print(f"  p.{page} {sig} [{ms}] -> {img_file} ({conf})")
136
137    conn.close()
138    print("\nDone.")
139
140
141if __name__ == "__main__":
142    main()