Alchemical Hands in the Hypnerotomachia Poliphili

Marginalia, Scholarship & Reception

← All Scripts

PDF to Markdown

pdf_to_markdown.py — 373 lines

Extracts all PDFs to markdown with YAML frontmatter, page markers, and metadata lookup.

1"""Extract all PDFs to markdown files with YAML frontmatter."""
2
3import re
4import os
5from pathlib import Path
6
7try:
8    import fitz  # PyMuPDF
9except ImportError:
10    print("ERROR: PyMuPDF not installed. Run: pip install PyMuPDF")
11    raise
12
# Directory two levels up from this script (the parent of the folder that
# contains it); main() looks for PDFs directly inside this directory.
BASE_DIR = Path(__file__).resolve().parent.parent
# Output directory for the generated markdown files.
MD_DIR = BASE_DIR / "md"
# Created at import time if missing; an existing directory is left untouched.
MD_DIR.mkdir(exist_ok=True)
16
# Known metadata mappings for better frontmatter.
#
# Each key is a fragment of a PDF filename. find_metadata() walks this dict in
# insertion order and uses the first key that occurs as a substring of the
# filename, so a key must be specific enough not to match unrelated files.
# Every entry carries 'title', 'author' and 'doc_type'; 'year' and 'journal'
# are present only where known.
KNOWN_METADATA = {
    'PhD_Thesis_ _James_Russell': {
        'title': 'Hypnerotomachia Poliphili: A Study of Marginal Annotations in Six Copies',
        'author': 'James Russell',
        'year': 2024,
        'doc_type': 'DISSERTATION',
    },
    'E_Thesis_Durham_University_Self_Transfor Oneill': {
        'title': 'Self-transformation in the Hypnerotomachia Poliphili',
        'author': "James O'Neill",
        'year': 2025,
        'journal': 'Durham University (E-Thesis)',
        'doc_type': 'DISSERTATION',
    },
    'A_Narrative_in_Search_of_an_Author': {
        'title': 'A Narrative in Search of an Author: The Hypnerotomachia Poliphili',
        'author': "James O'Neill",
        'year': 2025,
        'doc_type': 'SCHOLARSHIP',
    },
    'Francesco Colonna Hypnerotomachia Poliphili Da Capo': {
        'title': 'Hypnerotomachia Poliphili (Da Capo Press edition)',
        'author': 'Francesco Colonna',
        'year': 1499,
        'doc_type': 'PRIMARY_TEXT',
    },
    'Francesco Colonna Rino Avesani': {
        'title': 'Hypnerotomachia Poliphili, Vol. 1 (Antenore critical edition)',
        'author': 'Francesco Colonna (ed. Pozzi & Ciapponi)',
        'year': 1964,
        'doc_type': 'PRIMARY_TEXT',
    },
    'Hypnerotomachia by Francesco Colonna': {
        'title': 'Hypnerotomachia Poliphili',
        'author': 'Francesco Colonna',
        'year': 1499,
        'doc_type': 'PRIMARY_TEXT',
    },
    'The HP of Ben Jonson': {
        'title': 'The HP of Ben Jonson and Kenelm Digby',
        'author': 'Unknown',
        'year': 2025,
        'doc_type': 'PRESENTATION',
    },
    'Crossing_the_text_image_boundary': {
        'title': 'Crossing the Text-Image Boundary: The French HP',
        'author': 'Unknown',
        'doc_type': 'SCHOLARSHIP',
    },
    'Dream_Narratives_and_Initiation_Processe': {
        'title': 'Dream Narratives and Initiation Processes',
        'author': 'Unknown',
        'doc_type': 'SCHOLARSHIP',
    },
    'Editions SR 25 James Gollnick': {
        'title': 'Religious Dreamworld of Apuleius Metamorphoses: Recovering a Forgotten Hermeneutic',
        'author': 'James Gollnick',
        'doc_type': 'SCHOLARSHIP',
    },
    'Elucidating_and_Enigmatizing_the_Recepti': {
        'title': 'Elucidating and Enigmatizing the Reception of the HP',
        'author': 'Unknown',
        'doc_type': 'SCHOLARSHIP',
    },
    'Eugenio Canone_ Leen Spruit': {
        'title': 'Emblematics in the Early Modern Age: Case Studies',
        'author': 'Eugenio Canone and Leen Spruit',
        'doc_type': 'SCHOLARSHIP',
    },
    'Georg Leidinger Albrecht D': {
        'title': 'Albrecht Durer und die Hypnerotomachia Poliphili',
        'author': 'Georg Leidinger',
        'doc_type': 'SCHOLARSHIP',
    },
    'Italica 1947': {
        'title': 'Some Foreign Imitators of the Hypnerotomachia Poliphili',
        'author': 'Mario Praz',
        'year': 1947,
        'journal': 'Italica 24:1',
        'doc_type': 'SCHOLARSHIP',
    },
    'Journal of the Warburg Institute 1937': {
        'title': 'The Hypnerotomachia Poliphili in 17th Century France',
        'author': 'Anthony Blunt',
        'year': 1937,
        'journal': 'Journal of the Warburg Institute 1:2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Journal of the Warburg and Courtauld Institutes vol 47': {
        'title': 'Alberti and the Hypnerotomachia Poliphili',
        'author': 'D. R. Edward Wright',
        'year': 1984,
        'journal': 'Journal of the Warburg and Courtauld Institutes 47',
        'doc_type': 'SCHOLARSHIP',
    },
    'Liane Lefaivre': {
        'title': "Leon Battista Alberti's Hypnerotomachia Poliphili: Re-Cognizing the Architectural Body",
        'author': 'Liane Lefaivre',
        'year': 1997,
        'doc_type': 'SCHOLARSHIP',
    },
    'Notes and Queries 1952': {
        'title': 'Some Notes on the Vocabulary of the Hypnerotomachia Poliphili',
        'author': 'Peter Ure',
        'year': 1952,
        'journal': 'Notes and Queries 197:26',
        'doc_type': 'SCHOLARSHIP',
    },
    'Renaissance Quarterly vol 55': {
        'title': 'The Hypnerotomachia Poliphili, Image, Text, and Vernacular Poetics',
        'author': 'Rosemary Trippe',
        'year': 2002,
        'journal': 'Renaissance Quarterly 55:4',
        'doc_type': 'SCHOLARSHIP',
    },
    'Renaissance Studies 1990': {
        'title': 'The Structural Problematic of the Hypnerotomachia Poliphili',
        'author': 'Mark Jarzombek',
        'year': 1990,
        'journal': 'Renaissance Studies 4:3',
        'doc_type': 'SCHOLARSHIP',
    },
    'Studies in Philology 2006': {
        'title': "Robert Dallington's Hypnerotomachia and the Protestant Antiquity of Elizabethan England",
        'author': 'L. E. Semler',
        'year': 2006,
        'journal': 'Studies in Philology 103:2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Teaching_Eros': {
        'title': 'Teaching Eros: The Rhetoric of Love in the Hypnerotomachia Poliphili',
        'author': 'Unknown',
        'doc_type': 'SCHOLARSHIP',
    },
    'The Modern Language Review 1955': {
        'title': 'Francesco Colonna and Rabelais',
        'author': 'Marcel Francon',
        'year': 1955,
        'journal': 'The Modern Language Review 50:1',
        'doc_type': 'SCHOLARSHIP',
    },
    'The_Narrative_Function_of_Hieroglyphs': {
        'title': 'The Narrative Function of Hieroglyphs in the Hypnerotomachia Poliphili',
        'author': 'Unknown',
        'doc_type': 'SCHOLARSHIP',
    },
    'Untangling the knot': {
        'title': "Untangling the Knot: Garden Design in Francesco Colonna's Hypnerotomachia Poliphili",
        'author': 'Unknown',
        'doc_type': 'SCHOLARSHIP',
    },
    'Walking_in_the_Boboli': {
        'title': 'Walking in the Boboli Gardens in Florence',
        'author': 'Unknown',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 1998 jan vol 14 iss 1 2 Bury John': {
        'title': 'Chapter III of the Hypnerotomachia Poliphili and the Antiquarian Culture of the Quattrocento',
        'author': 'John Bury',
        'year': 1998,
        'journal': 'Word & Image 14:1-2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 1998 jan vol 14 iss 1 2 Curran Brian': {
        'title': 'The Hypnerotomachia Poliphili and Renaissance Egyptology',
        'author': 'Brian A. Curran',
        'year': 1998,
        'journal': 'Word & Image 14:1-2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 1998 jan vol 14 iss 1 2 Griggs Tamara': {
        'title': "Promoting the Past: The Hypnerotomachia Poliphili as Antiquarian Enterprise",
        'author': 'Tamara Griggs',
        'year': 1998,
        'journal': 'Word & Image 14:1-2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 1998 jan vol 14 iss 1 2 Hunt John Dixon': {
        'title': 'Experiencing Gardens in the Hypnerotomachia Poliphili',
        'author': 'John Dixon Hunt',
        'year': 1998,
        'journal': 'Word & Image 14:1-2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 1998 jan vol 14 iss 1 2 Leslie Michael': {
        'title': 'The Hypnerotomachia Poliphili and the Elizabethan Landscape Entertainment',
        'author': 'Michael Leslie',
        'year': 1998,
        'journal': 'Word & Image 14:1-2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 1998 jan vol 14 iss 1 2 Stewering Roswitha': {
        'title': 'The Relationship between Text and Woodcuts in the Hypnerotomachia Poliphili',
        'author': 'Roswitha Stewering (trans. Lorna Maher)',
        'year': 1998,
        'journal': 'Word & Image 14:1-2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 1998 jan vol 14 iss 1 2 Temple N': {
        'title': 'The Hypnerotomachia Poliphili as a Possible Model for Garden Design',
        'author': 'N. Temple',
        'year': 1998,
        'journal': 'Word & Image 14:1-2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 2015 apr 03 vol 31 iss 2 Fabiani Giannetto': {
        'title': "Not Before Either: The Hypnerotomachia Poliphili and the Villa d'Este at Tivoli",
        'author': 'Raffaella Fabiani Giannetto',
        'year': 2015,
        'journal': 'Word & Image 31:2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 2015 apr 03 vol 31 iss 2 Farrington Lynne': {
        'title': "Though I Could Lead a Quiet Life: The Hypnerotomachia Poliphili in English Translation",
        'author': 'Lynne Farrington',
        'year': 2015,
        'journal': 'Word & Image 31:2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 2015 apr 03 vol 31 iss 2 Keller William': {
        'title': 'Hypnerotomachia Joins the Party: Reading across Word and Image',
        'author': 'William B. Keller',
        'year': 2015,
        'journal': 'Word & Image 31:2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 2015 apr 03 vol 31 iss 2 Nygren Christopher': {
        'title': 'The Hypnerotomachia Poliphili and the Woodcut as Mirror',
        'author': 'Christopher J. Nygren',
        'year': 2015,
        'journal': 'Word & Image 31:2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 2015 apr 03 vol 31 iss 2 Pumroy Eric': {
        'title': "Bryn Mawr College's 1499 Edition of the Hypnerotomachia Poliphili",
        'author': 'Eric L. Pumroy',
        'year': 2015,
        'journal': 'Word & Image 31:2',
        'doc_type': 'SCHOLARSHIP',
    },
}
259
260
def find_metadata(filename):
    """Look up curated metadata for a PDF by its filename.

    KNOWN_METADATA is scanned in insertion order and the first key that
    occurs as a substring of ``filename`` wins.

    Returns:
        A fresh copy of the matching entry (safe for the caller to mutate),
        or an empty dict when no key matches.
    """
    candidates = (
        entry
        for fragment, entry in KNOWN_METADATA.items()
        if fragment in filename
    )
    first_hit = next(candidates, None)
    return {} if first_hit is None else dict(first_hit)
267
268
def clean_filename(filename):
    """Create a clean slug from a filename for the markdown output file.

    The extension is dropped, known download junk is removed, every run of
    non-word characters becomes a single underscore, and the result is capped
    at 80 characters. ``\\w`` is Unicode-aware for ``str`` patterns, so
    accented letters are kept.

    Args:
        filename: File name (or path) of the source document.

    Returns:
        The slug, without extension. Falls back to ``'untitled'`` when the
        name contains no word characters, so the caller never ends up
        writing a file called just ``.md``.
    """
    stem = Path(filename).stem
    # Drop archive-download timestamps such as "-20240115T123456Z-001"
    # (YYYYMMDD, 'T', time digits, 'Z') plus everything after them. The date
    # part is "20" + six digits; requiring only five never matched a real
    # eight-digit date.
    stem = re.sub(r'-20\d{6}T\d+Z.*', '', stem)
    # Drop parenthesised numeric counters such as " (1)".
    stem = re.sub(r'\s*\(\d+\)\s*', '', stem)
    # Replace spaces and special chars; then collapse underscores that were
    # already adjacent in the name (``_`` counts as a word character above).
    slug = re.sub(r'[^\w]+', '_', stem)
    slug = re.sub(r'_+', '_', slug).strip('_')
    # Truncate, making sure the cut does not leave a dangling underscore.
    if len(slug) > 80:
        slug = slug[:80].rstrip('_')
    return slug or 'untitled'
282
283
def extract_pdf_text(pdf_path):
    """Extract the text of every non-blank page of a PDF.

    Args:
        pdf_path: Location of the PDF; converted with ``str()`` before being
            handed to ``fitz.open``.

    Returns:
        A list of ``(page_num, text)`` tuples in document order, where
        ``page_num`` is 1-based. Pages whose extracted text is empty or
        whitespace-only are skipped, so page numbers may have gaps.

    Raises:
        Any exception raised by ``fitz.open`` or ``page.get_text`` is
        propagated to the caller; the document is closed first.
    """
    doc = fitz.open(str(pdf_path))
    try:
        pages = []
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text()
            if text.strip():
                pages.append((page_num, text))
        return pages
    finally:
        # Release the file handle even when a page fails to extract.
        doc.close()
294
295
def pages_to_markdown(pages):
    """Join extracted pages into one markdown string.

    Each page is rendered as an HTML comment marker (``<!-- Page N -->``)
    followed by its text, and pages are separated by a ``---`` rule.

    Per-page cleanup:
      * runs of three or more newlines are collapsed to two;
      * the first line is dropped when it is shorter than 10 characters
        after stripping (a heuristic for running headers or page numbers).
        Only the first line is examined; trailing lines are kept as-is.

    Args:
        pages: Iterable of ``(page_num, text)`` tuples.
    """
    rendered = []
    for number, raw in pages:
        collapsed = re.sub(r'\n{3,}', '\n\n', raw)
        # Split off the first line without touching the rest of the page.
        first_line, _, remainder = collapsed.partition('\n')
        body = remainder if len(first_line.strip()) < 10 else collapsed
        rendered.append(f"<!-- Page {number} -->\n\n{body}")
    return '\n\n---\n\n'.join(rendered)
309
310
def _yaml_quote(value):
    """Return ``value`` as a YAML double-quoted scalar with escapes applied."""
    # Backslashes must be escaped first so the escapes added below survive.
    escaped = (
        value.replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
    )
    return f'"{escaped}"'


def write_markdown(output_path, metadata, content, page_count):
    """Write a markdown file with YAML frontmatter.

    The frontmatter lists, in fixed order, whichever of ``title``,
    ``author``, ``year``, ``journal``, ``source`` and ``doc_type`` are
    present and truthy in ``metadata``, followed by ``page_count``. String
    values are always double-quoted and escaped, so titles containing
    colons, quotes or backslashes still form valid YAML. Non-string values
    (e.g. an integer year) are written bare.

    Args:
        output_path: Destination file; overwritten if it exists.
        metadata: Mapping of frontmatter fields; missing keys are skipped.
        content: Markdown body placed after the ``# title`` heading.
        page_count: Number written as the ``page_count`` field.
    """
    frontmatter_lines = ['---']
    for key in ('title', 'author', 'year', 'journal', 'source', 'doc_type'):
        val = metadata.get(key)
        if not val:
            continue
        if isinstance(val, str):
            val = _yaml_quote(val)
        frontmatter_lines.append(f'{key}: {val}')
    frontmatter_lines.append(f'page_count: {page_count}')
    frontmatter_lines.append('---')
    frontmatter = '\n'.join(frontmatter_lines)

    # An empty title is treated like a missing one, matching the frontmatter
    # loop above, which also skips falsy values.
    title = metadata.get('title') or 'Untitled'
    header = f"# {title}\n\n"

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(f"{frontmatter}\n\n{header}{content}")
331
332
def main():
    """Convert every PDF directly inside BASE_DIR to markdown in MD_DIR.

    The scan is non-recursive. PDFs are processed in sorted order; each one
    is extracted, paired with any known metadata, and written as
    ``<slug>.md``. PDFs with no extractable text are reported as EMPTY and
    skipped. A failure on one file is reported and counted, and the run
    continues with the next file. A summary line is printed at the end.

    If two PDFs clean to the same slug, the later one gets a numeric suffix
    (``_2``, ``_3``, ...) instead of silently overwriting the earlier output.
    """
    extensions = {'.pdf'}
    pdf_files = sorted(
        f for f in BASE_DIR.iterdir()
        if f.suffix.lower() in extensions and f.is_file()
    )

    print(f"Found {len(pdf_files)} PDFs to convert")

    success = 0
    empty = 0
    errors = 0
    written_slugs = set()  # slugs already written during this run

    for pdf_path in pdf_files:
        base_slug = clean_filename(pdf_path.name)
        slug = base_slug
        suffix = 2
        while slug in written_slugs:
            slug = f"{base_slug}_{suffix}"
            suffix += 1
        if slug != base_slug:
            print(f"  WARNING: {pdf_path.name}: slug '{base_slug}' already used, writing {slug}.md")
        output_path = MD_DIR / f"{slug}.md"

        # Broad catch is deliberate: this is the per-file boundary of a batch
        # job, and one unreadable PDF should not abort the whole run.
        try:
            pages = extract_pdf_text(pdf_path)
            if not pages:
                print(f"  EMPTY: {pdf_path.name}")
                empty += 1
                continue

            metadata = find_metadata(pdf_path.name)
            metadata['source'] = pdf_path.name

            content = pages_to_markdown(pages)
            write_markdown(output_path, metadata, content, len(pages))
            # Reserve the slug only once a file has actually been written.
            written_slugs.add(slug)

            word_count = len(content.split())
            print(f"  OK: {slug}.md ({len(pages)} pages, {word_count} words)")
            success += 1

        except Exception as e:
            print(f"  ERROR: {pdf_path.name}: {e}")
            errors += 1

    print(f"\nResults: {success} converted, {empty} empty, {errors} errors")


if __name__ == "__main__":
    main()