Extract References

extract_references.py — 176 lines
Uses PyMuPDF + regex to extract 282 folio/signature references from Russell's PhD thesis PDF.
1"""Extract folio/signature references from Russell's PhD dissertation."""
2
3import sqlite3
4import re
5from pathlib import Path
6
7try:
8    import fitz  # PyMuPDF
9except ImportError:
10    print("ERROR: PyMuPDF not installed. Run: pip install PyMuPDF")
11    raise
12
# Project root: two directory levels above this script's resolved location.
BASE_DIR = Path(__file__).resolve().parent.parent

# SQLite database that receives the extracted references.
DB_PATH = BASE_DIR.joinpath("db", "hp.db")

# The thesis PDF is expected directly under the project root.
THESIS_FILENAME = "PhD_Thesis_ _James_Russell Hypnerotomachia Polyphili.pdf"
THESIS_PATH = BASE_DIR.joinpath(THESIS_FILENAME)
17
# Chapter page ranges (approximate, from TOC - 1-indexed PDF pages).
# Each value is an inclusive (first_page, last_page) pair.
# These will be refined after first extraction pass.
#
# The ranges must not overlap: the chapter lookup returns the first range
# containing a page, so an overlapping entry would be shadowed by the one
# listed before it. Chapter 9 therefore starts on the page after chapter 8
# ends (it previously started at 204, duplicating chapter 8's start; pages
# 204-230 resolved to chapter 8 either way, so lookups are unchanged).
CHAPTER_RANGES = {
    1: (1, 40),      # The HP and its readership
    2: (41, 60),     # Literature Review
    3: (61, 80),     # Methodology
    4: (81, 100),    # Modena - F.C. Panini Estate
    5: (101, 122),   # Como - INCUN A.5.13
    6: (123, 170),   # London - BL C.60.o.12
    7: (171, 203),   # Buffalo
    8: (204, 230),   # Vatican
    9: (231, 250),   # Siena O.III.38 + Sydney
    10: (251, 262),  # Conclusions
}
32
# Shelfmark of the copy discussed in each chapter, keyed by chapter number.
# Chapters without an entry here have no associated shelfmark.
CHAPTER_MANUSCRIPTS = {
    4: "Modena (Panini)",
    5: "INCUN A.5.13",
    6: "C.60.o.12",
    7: "Buffalo RBR",
    8: "Inc.Stam.Chig.II.610",
    9: "O.III.38",
}
42
# Core shape of a signature reference: one or two quire letters, a single
# leaf digit, then r (recto) or v (verso) - e.g. a4r, c7v, p6v.
_SIG_CORE = r'[a-zA-Z]{1,2}\d[rv]'

# Signature references as they appear in the thesis text. Exactly one of the
# three capture groups is set per match:
#   group 1 - parenthesized form: (a4r)
#   group 2 - followed by a colon: a4r:
#   group 3 - bare token: a4r (more false positives)
SIG_PATTERN = re.compile(
    rf'(?:\(({_SIG_CORE})\))'
    rf'|(?:\b({_SIG_CORE}):)'
    rf'|(?:\b({_SIG_CORE})\b)'
)
50
# Quoted marginal text: a span of 3-200 characters wrapped in single quotes
# (straight ' or curly U+2018 / U+2019). Group 1 is the text between the quotes.
MARGINAL_QUOTE = re.compile(
    r"['\u2018]"             # opening quote
    r"([^'\u2019]{3,200})"   # quoted text, no closing-quote characters inside
    r"['\u2019]"             # closing quote
)
53
54
def get_chapter(page_num):
    """Map a 1-indexed PDF page number to its chapter number.

    Returns the key of the first CHAPTER_RANGES entry whose inclusive
    (start, end) range contains *page_num*, or None when no range does.
    """
    return next(
        (
            chapter
            for chapter, (first_page, last_page) in CHAPTER_RANGES.items()
            if first_page <= page_num <= last_page
        ),
        None,
    )
61
62
def get_manuscript(chapter_num):
    """Look up the manuscript shelfmark for a chapter.

    Returns the CHAPTER_MANUSCRIPTS entry for *chapter_num*, or None when
    the chapter (including None itself) has no entry.
    """
    if chapter_num in CHAPTER_MANUSCRIPTS:
        return CHAPTER_MANUSCRIPTS[chapter_num]
    return None
66
67
def extract_context(text, match_start, match_end, window=500):
    """Return the paragraph of *text* surrounding a match, whitespace-stripped.

    A paragraph is delimited by blank lines (two consecutive newlines). When
    no delimiter exists on one side of the match, that side is cut *window*
    characters away from the match instead, clamped to the bounds of *text*.
    """
    separator = '\n\n'

    # Left edge: just past the nearest blank line before the match.
    preceding_break = text.rfind(separator, 0, match_start)
    if preceding_break >= 0:
        left = preceding_break + len(separator)
    else:
        left = max(0, match_start - window)

    # Right edge: the nearest blank line at or after the end of the match.
    following_break = text.find(separator, match_end)
    if following_break < 0:
        right = min(len(text), match_end + window)
    else:
        right = following_break

    return text[left:right].strip()
78
79
def extract_marginal_text(context, sig_pos):
    """Return the quoted text in *context* that starts nearest to *sig_pos*.

    Distance is measured from the opening quote of each MARGINAL_QUOTE match
    to *sig_pos*; on a tie the earlier quote wins. Returns None when
    *context* contains no quoted text.
    """
    nearest = None
    nearest_distance = None
    for quote in MARGINAL_QUOTE.finditer(context):
        distance = abs(quote.start() - sig_pos)
        # Strict comparison keeps the first quote when distances are equal.
        if nearest_distance is None or distance < nearest_distance:
            nearest = quote
            nearest_distance = distance

    if nearest is None:
        return None
    return nearest.group(1)
89
90
# Two-letter English words that the bare signature pattern picks up when they
# sit directly in front of a digit + r/v in the extracted text
# (e.g. "in" + digit + r/v). Matches whose quire part is one of these are
# treated as false positives and skipped.
_NON_QUIRE_WORDS = frozenset((
    'in', 'an', 'on', 'or', 'as', 'at', 'is', 'it',
    'no', 'so', 'to', 'do', 'go', 'he', 'me', 'we',
    'be', 'of', 'if', 'up', 'my', 'by',
))


def _sig_offset_in_context(text, context, match):
    """Return the index inside *context* at which *match* begins.

    *context* is the stripped slice of *text* produced by extract_context()
    for this match. The offset is recovered by aligning *context* against
    *text*, so it stays correct whichever boundary rule produced the slice
    and however much leading whitespace was stripped. Falls back to 0 if no
    alignment is found.
    """
    needle = match.group(0)
    pos = context.find(needle)
    while pos != -1:
        origin = match.start() - pos
        # Accept this occurrence only if context really sits at `origin`.
        if origin >= 0 and text.startswith(context, origin):
            return pos
        pos = context.find(needle, pos + 1)
    return 0


def _store_references(doc, cur):
    """Insert one dissertation_refs row per signature hit found in *doc*.

    Returns (number_of_rows_inserted, set_of_distinct_signatures).
    """
    ref_count = 0
    all_sigs = set()

    for page_idx in range(len(doc)):
        text = doc[page_idx].get_text()
        page_num = page_idx + 1  # 1-indexed

        # Chapter and shelfmark depend only on the page, not on the match.
        chapter = get_chapter(page_num)
        manuscript = get_manuscript(chapter)

        for match in SIG_PATTERN.finditer(text):
            # Exactly one alternative (and so one group) matches per hit.
            sig = match.group(match.lastindex)

            # A signature is <quire letters><digit><r|v>, so dropping the
            # last two characters leaves the quire letters.
            if sig[:-2].lower() in _NON_QUIRE_WORDS:
                continue

            all_sigs.add(sig)

            context = extract_context(text, match.start(), match.end())
            # Position of the signature relative to the context string, so
            # the nearest-quote search measures from the right place.
            sig_pos = _sig_offset_in_context(text, context, match)
            marginal = extract_marginal_text(context, sig_pos)

            cur.execute(
                """INSERT INTO dissertation_refs
                   (thesis_page, signature_ref, manuscript_shelfmark,
                    context_text, marginal_text, ref_type, chapter_num)
                   VALUES (?, ?, ?, ?, ?, ?, ?)""",
                (page_num, sig, manuscript, context, marginal, 'MARGINALIA', chapter)
            )
            ref_count += 1

    return ref_count, all_sigs


def _print_summary(cur, ref_count, unique_count):
    """Print totals, the per-chapter distribution and ten sample rows."""
    print(f"\nExtracted {ref_count} references")
    print(f"Unique signatures: {unique_count}")

    # Show distribution by chapter
    print("\nReferences by chapter:")
    cur.execute(
        "SELECT chapter_num, COUNT(*) FROM dissertation_refs GROUP BY chapter_num"
    )
    counts = dict(cur.fetchall())
    for ch in sorted(CHAPTER_RANGES):
        count = counts.get(ch, 0)
        ms = CHAPTER_MANUSCRIPTS.get(ch, '')
        if count > 0:
            print(f"  Ch {ch} ({ms}): {count} refs")

    # Show sample references
    print("\nSample references:")
    cur.execute("""SELECT thesis_page, signature_ref, manuscript_shelfmark,
                   substr(context_text, 1, 100), marginal_text
                   FROM dissertation_refs LIMIT 10""")
    for page, sig, ms, ctx, marg in cur.fetchall():
        print(f"  p.{page} {sig} [{ms}]: {ctx}...")
        if marg:
            print(f"    Marginal text: '{marg[:80]}'")


def main():
    """Rebuild the dissertation_refs table from the thesis PDF.

    Opens the PDF at THESIS_PATH, deletes all existing rows from
    dissertation_refs in the database at DB_PATH, inserts one row per
    signature reference found on each page, commits, and prints a summary.
    Prints an error and returns without touching the database if the PDF is
    missing. The PDF and the database connection are closed even when
    extraction fails; the commit happens only after every page has been
    processed.
    """
    if not THESIS_PATH.exists():
        print(f"ERROR: Thesis not found at {THESIS_PATH}")
        return

    print(f"Opening {THESIS_FILENAME}...")
    doc = fitz.open(str(THESIS_PATH))
    try:
        print(f"  {len(doc)} pages")

        conn = sqlite3.connect(DB_PATH)
        try:
            cur = conn.cursor()

            # Clear existing refs
            cur.execute("DELETE FROM dissertation_refs")

            ref_count, all_sigs = _store_references(doc, cur)
            conn.commit()

            _print_summary(cur, ref_count, len(all_sigs))
        finally:
            conn.close()
    finally:
        doc.close()

    print("\nDone.")


if __name__ == "__main__":
    main()