Extract References
Uses PyMuPDF + regex to extract 282 folio/signature references from Russell's PhD thesis PDF.
"""Extract folio/signature references from Russell's PhD dissertation.

The thesis PDF is read page by page with PyMuPDF, every signature reference
(e.g. ``a4r``, ``(c7v)``, ``p6v:``) is located with a regex, and one row per
reference is written to the ``dissertation_refs`` table of the SQLite
database at ``DB_PATH``. Existing rows in that table are replaced.
"""

import re
import sqlite3
from pathlib import Path
from typing import Optional

BASE_DIR = Path(__file__).resolve().parent.parent
DB_PATH = BASE_DIR / "db" / "hp.db"
THESIS_FILENAME = "PhD_Thesis_ _James_Russell Hypnerotomachia Polyphili.pdf"
THESIS_PATH = BASE_DIR / THESIS_FILENAME

# Chapter page ranges (approximate, from TOC - 1-indexed PDF pages).
# These will be refined after first extraction pass.
# Ranges must not overlap: get_chapter() returns the first range that
# contains the page.
CHAPTER_RANGES = {
    1: (1, 40),      # The HP and its readership
    2: (41, 60),     # Literature Review
    3: (61, 80),     # Methodology
    4: (81, 100),    # Modena - F.C. Panini Estate
    5: (101, 122),   # Como - INCUN A.5.13
    6: (123, 170),   # London - BL C.60.o.12
    7: (171, 203),   # Buffalo
    8: (204, 230),   # Vatican
    9: (231, 250),   # Siena O.III.38 + Sydney
    10: (251, 262),  # Conclusions
}

# Manuscript shelfmark by chapter (chapters without a copy study are absent).
CHAPTER_MANUSCRIPTS = {
    4: 'Modena (Panini)',
    5: 'INCUN A.5.13',
    6: 'C.60.o.12',
    7: 'Buffalo RBR',
    8: 'Inc.Stam.Chig.II.610',
    9: 'O.III.38',
}

# Regex patterns for signature references.
# Matches: (a4r), (c7v), (p6v), a7r:, i6r, etc.
SIG_PATTERN = re.compile(
    r'(?:\(([a-zA-Z]{1,2}\d[rv])\))'     # parenthesized: (a4r)
    r'|(?:\b([a-zA-Z]{1,2}\d[rv]):)'     # with colon: a4r:
    r'|(?:\b([a-zA-Z]{1,2}\d[rv])\b)'    # bare: a4r (more false positives)
)

# Pattern for quoted marginal text (text in single quotes near a signature ref).
MARGINAL_QUOTE = re.compile(r"['\u2018]([^'\u2019]{3,200})['\u2019]")

# Two-letter English words that, followed by a digit and r/v, look like a
# signature but are treated as false positives.
FALSE_POSITIVE_QUIRES = frozenset({
    'in', 'an', 'on', 'or', 'as', 'at', 'is', 'it',
    'no', 'so', 'to', 'do', 'go', 'he', 'me', 'we',
    'be', 'of', 'if', 'up', 'my', 'by',
})

# Value stored in the ref_type column for every extracted reference.
REF_TYPE = 'MARGINALIA'

INSERT_REF_SQL = """INSERT INTO dissertation_refs
    (thesis_page, signature_ref, manuscript_shelfmark,
     context_text, marginal_text, ref_type, chapter_num)
    VALUES (?, ?, ?, ?, ?, ?, ?)"""


def get_chapter(page_num: int) -> Optional[int]:
    """Return the chapter whose page range contains ``page_num``, else None."""
    for ch, (start, end) in CHAPTER_RANGES.items():
        if start <= page_num <= end:
            return ch
    return None


def get_manuscript(chapter_num: Optional[int]) -> Optional[str]:
    """Return the manuscript shelfmark for ``chapter_num``, or None if unmapped."""
    return CHAPTER_MANUSCRIPTS.get(chapter_num)


def _context_bounds(text, match_start, match_end, window=500):
    """Return ``(start, end)`` offsets in ``text`` of the paragraph around a match.

    Paragraphs are delimited by a blank line (``'\\n\\n'``). When no delimiter
    is found on a side, the span extends ``window`` characters in that
    direction, clamped to the text.
    """
    boundary = text.rfind('\n\n', 0, match_start)
    start = boundary + 2 if boundary >= 0 else max(0, match_start - window)

    boundary = text.find('\n\n', match_end)
    end = boundary if boundary >= 0 else min(len(text), match_end + window)
    return start, end


def extract_context(text, match_start, match_end, window=500):
    """Extract the surrounding paragraph context, stripped of outer whitespace."""
    start, end = _context_bounds(text, match_start, match_end, window)
    return text[start:end].strip()


def extract_marginal_text(context, sig_pos):
    """Return the quoted text in ``context`` that starts closest to ``sig_pos``.

    ``sig_pos`` is an offset into ``context``. Returns None when ``context``
    contains no quoted span.
    """
    quotes = list(MARGINAL_QUOTE.finditer(context))
    if not quotes:
        return None

    closest = min(quotes, key=lambda m: abs(m.start() - sig_pos))
    return closest.group(1)


def extract_page_refs(text, page_num):
    """Find every signature reference in one page of text.

    Args:
        text: Plain text of the page.
        page_num: 1-indexed PDF page number, used for the chapter lookup.

    Returns:
        A list of tuples ``(thesis_page, signature_ref, manuscript_shelfmark,
        context_text, marginal_text, ref_type, chapter_num)`` in the order the
        references occur, matching the column order of ``INSERT_REF_SQL``.
    """
    chapter = get_chapter(page_num)
    manuscript = get_manuscript(chapter)

    rows = []
    for match in SIG_PATTERN.finditer(text):
        # Exactly one of the three alternatives captures the signature.
        sig = match.group(1) or match.group(2) or match.group(3)
        if not sig:
            continue

        # A signature is <quire letters><leaf digit><r|v>; drop the last two
        # characters to get the quire and skip ordinary words such as "in3r".
        if sig[:-2].lower() in FALSE_POSITIVE_QUIRES:
            continue

        start, end = _context_bounds(text, match.start(), match.end())
        raw_context = text[start:end]
        context = raw_context.strip()

        # Offset of the signature inside the *stripped* context: relative to
        # the real context start, minus any leading whitespace strip() removed.
        leading_ws = len(raw_context) - len(raw_context.lstrip())
        sig_pos = match.start() - start - leading_ws
        marginal = extract_marginal_text(context, sig_pos)

        rows.append(
            (page_num, sig, manuscript, context, marginal, REF_TYPE, chapter)
        )
    return rows


def main():
    """Extract all references from the thesis PDF and store them in the database."""
    # Imported here so the pure helpers above stay importable without PyMuPDF.
    try:
        import fitz  # PyMuPDF
    except ImportError:
        print("ERROR: PyMuPDF not installed. Run: pip install PyMuPDF")
        raise

    if not THESIS_PATH.exists():
        print(f"ERROR: Thesis not found at {THESIS_PATH}")
        return

    print(f"Opening {THESIS_FILENAME}...")
    doc = fitz.open(str(THESIS_PATH))
    try:
        print(f" {len(doc)} pages")
        rows = []
        for page_num, page in enumerate(doc, start=1):  # 1-indexed pages
            rows.extend(extract_page_refs(page.get_text(), page_num))
    finally:
        doc.close()

    conn = sqlite3.connect(DB_PATH)
    try:
        # One transaction: the old rows are only discarded if every new row
        # is inserted successfully.
        with conn:
            conn.execute("DELETE FROM dissertation_refs")
            conn.executemany(INSERT_REF_SQL, rows)

        unique_sigs = {row[1] for row in rows}
        print(f"\nExtracted {len(rows)} references")
        print(f"Unique signatures: {len(unique_sigs)}")

        # Show distribution by chapter
        print("\nReferences by chapter:")
        per_chapter = conn.execute(
            """SELECT chapter_num, COUNT(*) FROM dissertation_refs
               WHERE chapter_num IS NOT NULL
               GROUP BY chapter_num ORDER BY chapter_num"""
        )
        for ch, count in per_chapter:
            ms = CHAPTER_MANUSCRIPTS.get(ch, '')
            print(f" Ch {ch} ({ms}): {count} refs")

        # Show sample references
        print("\nSample references:")
        samples = conn.execute(
            """SELECT thesis_page, signature_ref, manuscript_shelfmark,
                      substr(context_text, 1, 100), marginal_text
               FROM dissertation_refs LIMIT 10"""
        )
        for page, sig, ms, ctx, marg in samples:
            print(f" p.{page} {sig} [{ms}]: {ctx}...")
            if marg:
                print(f" Marginal text: '{marg[:80]}'")
    finally:
        conn.close()

    print("\nDone.")


if __name__ == "__main__":
    main()