PDF to Markdown
Extracts all PDFs to markdown with YAML frontmatter, page markers, and metadata lookup.
"""Extract all PDFs to markdown files with YAML frontmatter."""

import re
import os
from pathlib import Path

try:
    import fitz  # PyMuPDF
except ImportError:
    print("ERROR: PyMuPDF not installed. Run: pip install PyMuPDF")
    raise

# PDFs are read from the directory one level above this script's directory;
# the markdown output goes into its "md" subdirectory.
BASE_DIR = Path(__file__).resolve().parent.parent
MD_DIR = BASE_DIR / "md"
MD_DIR.mkdir(exist_ok=True)

# Known metadata mappings for better frontmatter.
# Keys are substrings matched against the PDF filename (see find_metadata).
KNOWN_METADATA = {
    'PhD_Thesis_ _James_Russell': {
        'title': 'Hypnerotomachia Poliphili: A Study of Marginal Annotations in Six Copies',
        'author': 'James Russell',
        'year': 2024,
        'doc_type': 'DISSERTATION',
    },
    'E_Thesis_Durham_University_Self_Transfor Oneill': {
        'title': 'Self-transformation in the Hypnerotomachia Poliphili',
        'author': "James O'Neill",
        'year': 2025,
        'journal': 'Durham University (E-Thesis)',
        'doc_type': 'DISSERTATION',
    },
    'A_Narrative_in_Search_of_an_Author': {
        'title': 'A Narrative in Search of an Author: The Hypnerotomachia Poliphili',
        'author': "James O'Neill",
        'year': 2025,
        'doc_type': 'SCHOLARSHIP',
    },
    'Francesco Colonna Hypnerotomachia Poliphili Da Capo': {
        'title': 'Hypnerotomachia Poliphili (Da Capo Press edition)',
        'author': 'Francesco Colonna',
        'year': 1499,
        'doc_type': 'PRIMARY_TEXT',
    },
    'Francesco Colonna Rino Avesani': {
        'title': 'Hypnerotomachia Poliphili, Vol. 1 (Antenore critical edition)',
        'author': 'Francesco Colonna (ed. Pozzi & Ciapponi)',
        'year': 1964,
        'doc_type': 'PRIMARY_TEXT',
    },
    'Hypnerotomachia by Francesco Colonna': {
        'title': 'Hypnerotomachia Poliphili',
        'author': 'Francesco Colonna',
        'year': 1499,
        'doc_type': 'PRIMARY_TEXT',
    },
    'The HP of Ben Jonson': {
        'title': 'The HP of Ben Jonson and Kenelm Digby',
        'author': 'Unknown',
        'year': 2025,
        'doc_type': 'PRESENTATION',
    },
    'Crossing_the_text_image_boundary': {
        'title': 'Crossing the Text-Image Boundary: The French HP',
        'author': 'Unknown',
        'doc_type': 'SCHOLARSHIP',
    },
    'Dream_Narratives_and_Initiation_Processe': {
        'title': 'Dream Narratives and Initiation Processes',
        'author': 'Unknown',
        'doc_type': 'SCHOLARSHIP',
    },
    'Editions SR 25 James Gollnick': {
        'title': 'Religious Dreamworld of Apuleius Metamorphoses: Recovering a Forgotten Hermeneutic',
        'author': 'James Gollnick',
        'doc_type': 'SCHOLARSHIP',
    },
    'Elucidating_and_Enigmatizing_the_Recepti': {
        'title': 'Elucidating and Enigmatizing the Reception of the HP',
        'author': 'Unknown',
        'doc_type': 'SCHOLARSHIP',
    },
    'Eugenio Canone_ Leen Spruit': {
        'title': 'Emblematics in the Early Modern Age: Case Studies',
        'author': 'Eugenio Canone and Leen Spruit',
        'doc_type': 'SCHOLARSHIP',
    },
    'Georg Leidinger Albrecht D': {
        'title': 'Albrecht Durer und die Hypnerotomachia Poliphili',
        'author': 'Georg Leidinger',
        'doc_type': 'SCHOLARSHIP',
    },
    'Italica 1947': {
        'title': 'Some Foreign Imitators of the Hypnerotomachia Poliphili',
        'author': 'Mario Praz',
        'year': 1947,
        'journal': 'Italica 24:1',
        'doc_type': 'SCHOLARSHIP',
    },
    'Journal of the Warburg Institute 1937': {
        'title': 'The Hypnerotomachia Poliphili in 17th Century France',
        'author': 'Anthony Blunt',
        'year': 1937,
        'journal': 'Journal of the Warburg Institute 1:2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Journal of the Warburg and Courtauld Institutes vol 47': {
        'title': 'Alberti and the Hypnerotomachia Poliphili',
        'author': 'D. R. Edward Wright',
        'year': 1984,
        'journal': 'Journal of the Warburg and Courtauld Institutes 47',
        'doc_type': 'SCHOLARSHIP',
    },
    'Liane Lefaivre': {
        'title': "Leon Battista Alberti's Hypnerotomachia Poliphili: Re-Cognizing the Architectural Body",
        'author': 'Liane Lefaivre',
        'year': 1997,
        'doc_type': 'SCHOLARSHIP',
    },
    'Notes and Queries 1952': {
        'title': 'Some Notes on the Vocabulary of the Hypnerotomachia Poliphili',
        'author': 'Peter Ure',
        'year': 1952,
        'journal': 'Notes and Queries 197:26',
        'doc_type': 'SCHOLARSHIP',
    },
    'Renaissance Quarterly vol 55': {
        'title': 'The Hypnerotomachia Poliphili, Image, Text, and Vernacular Poetics',
        'author': 'Rosemary Trippe',
        'year': 2002,
        'journal': 'Renaissance Quarterly 55:4',
        'doc_type': 'SCHOLARSHIP',
    },
    'Renaissance Studies 1990': {
        'title': 'The Structural Problematic of the Hypnerotomachia Poliphili',
        'author': 'Mark Jarzombek',
        'year': 1990,
        'journal': 'Renaissance Studies 4:3',
        'doc_type': 'SCHOLARSHIP',
    },
    'Studies in Philology 2006': {
        'title': "Robert Dallington's Hypnerotomachia and the Protestant Antiquity of Elizabethan England",
        'author': 'L. E. Semler',
        'year': 2006,
        'journal': 'Studies in Philology 103:2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Teaching_Eros': {
        'title': 'Teaching Eros: The Rhetoric of Love in the Hypnerotomachia Poliphili',
        'author': 'Unknown',
        'doc_type': 'SCHOLARSHIP',
    },
    'The Modern Language Review 1955': {
        'title': 'Francesco Colonna and Rabelais',
        'author': 'Marcel Francon',
        'year': 1955,
        'journal': 'The Modern Language Review 50:1',
        'doc_type': 'SCHOLARSHIP',
    },
    'The_Narrative_Function_of_Hieroglyphs': {
        'title': 'The Narrative Function of Hieroglyphs in the Hypnerotomachia Poliphili',
        'author': 'Unknown',
        'doc_type': 'SCHOLARSHIP',
    },
    'Untangling the knot': {
        'title': "Untangling the Knot: Garden Design in Francesco Colonna's Hypnerotomachia Poliphili",
        'author': 'Unknown',
        'doc_type': 'SCHOLARSHIP',
    },
    'Walking_in_the_Boboli': {
        'title': 'Walking in the Boboli Gardens in Florence',
        'author': 'Unknown',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 1998 jan vol 14 iss 1 2 Bury John': {
        'title': 'Chapter III of the Hypnerotomachia Poliphili and the Antiquarian Culture of the Quattrocento',
        'author': 'John Bury',
        'year': 1998,
        'journal': 'Word & Image 14:1-2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 1998 jan vol 14 iss 1 2 Curran Brian': {
        'title': 'The Hypnerotomachia Poliphili and Renaissance Egyptology',
        'author': 'Brian A. Curran',
        'year': 1998,
        'journal': 'Word & Image 14:1-2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 1998 jan vol 14 iss 1 2 Griggs Tamara': {
        'title': "Promoting the Past: The Hypnerotomachia Poliphili as Antiquarian Enterprise",
        'author': 'Tamara Griggs',
        'year': 1998,
        'journal': 'Word & Image 14:1-2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 1998 jan vol 14 iss 1 2 Hunt John Dixon': {
        'title': 'Experiencing Gardens in the Hypnerotomachia Poliphili',
        'author': 'John Dixon Hunt',
        'year': 1998,
        'journal': 'Word & Image 14:1-2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 1998 jan vol 14 iss 1 2 Leslie Michael': {
        'title': 'The Hypnerotomachia Poliphili and the Elizabethan Landscape Entertainment',
        'author': 'Michael Leslie',
        'year': 1998,
        'journal': 'Word & Image 14:1-2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 1998 jan vol 14 iss 1 2 Stewering Roswitha': {
        'title': 'The Relationship between Text and Woodcuts in the Hypnerotomachia Poliphili',
        'author': 'Roswitha Stewering (trans. Lorna Maher)',
        'year': 1998,
        'journal': 'Word & Image 14:1-2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 1998 jan vol 14 iss 1 2 Temple N': {
        'title': 'The Hypnerotomachia Poliphili as a Possible Model for Garden Design',
        'author': 'N. Temple',
        'year': 1998,
        'journal': 'Word & Image 14:1-2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 2015 apr 03 vol 31 iss 2 Fabiani Giannetto': {
        'title': "Not Before Either: The Hypnerotomachia Poliphili and the Villa d'Este at Tivoli",
        'author': 'Raffaella Fabiani Giannetto',
        'year': 2015,
        'journal': 'Word & Image 31:2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 2015 apr 03 vol 31 iss 2 Farrington Lynne': {
        'title': "Though I Could Lead a Quiet Life: The Hypnerotomachia Poliphili in English Translation",
        'author': 'Lynne Farrington',
        'year': 2015,
        'journal': 'Word & Image 31:2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 2015 apr 03 vol 31 iss 2 Keller William': {
        'title': 'Hypnerotomachia Joins the Party: Reading across Word and Image',
        'author': 'William B. Keller',
        'year': 2015,
        'journal': 'Word & Image 31:2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 2015 apr 03 vol 31 iss 2 Nygren Christopher': {
        'title': 'The Hypnerotomachia Poliphili and the Woodcut as Mirror',
        'author': 'Christopher J. Nygren',
        'year': 2015,
        'journal': 'Word & Image 31:2',
        'doc_type': 'SCHOLARSHIP',
    },
    'Word Image 2015 apr 03 vol 31 iss 2 Pumroy Eric': {
        'title': "Bryn Mawr College's 1499 Edition of the Hypnerotomachia Poliphili",
        'author': 'Eric L. Pumroy',
        'year': 2015,
        'journal': 'Word & Image 31:2',
        'doc_type': 'SCHOLARSHIP',
    },
}

# Frontmatter keys are emitted in this fixed order; absent or empty ones are skipped.
FRONTMATTER_KEYS = ('title', 'author', 'year', 'journal', 'source', 'doc_type')

# Upper bound on the length of a generated slug (and hence of the output file stem).
MAX_SLUG_LENGTH = 80

# "-YYYYMMDDTHHMMSSZ" plus anything after it (presumably an export/download
# timestamp suffix). The date part is "20" followed by six digits.
_TIMESTAMP_SUFFIX_RE = re.compile(r'-20\d{6}T\d+Z.*')
# Duplicate-download counters such as " (1)".
_COPY_COUNTER_RE = re.compile(r'\s*\(\d+\)\s*')
_NON_WORD_RE = re.compile(r'[^\w]+')
_UNDERSCORE_RUN_RE = re.compile(r'_+')
_BLANK_RUN_RE = re.compile(r'\n{3,}')


def find_metadata(filename):
    """Return metadata for the first KNOWN_METADATA key contained in *filename*.

    Matching is a plain substring test, tried in the dict's insertion order.
    The result is a shallow copy, so callers may add keys without mutating
    the module-level table. Returns an empty dict when nothing matches.
    """
    for key, meta in KNOWN_METADATA.items():
        if key in filename:
            return dict(meta)
    return {}


def clean_filename(filename):
    """Create a clean slug from *filename* for the markdown file.

    The extension, a trailing ``-YYYYMMDDTHHMMSSZ...`` timestamp and
    ``(N)`` copy counters are removed; every run of non-word characters
    becomes a single underscore; the result is capped at MAX_SLUG_LENGTH
    characters. Falls back to ``'untitled'`` when nothing usable is left,
    so the caller never ends up writing a file named just ``.md``.
    """
    stem = Path(filename).stem
    # Remove common junk
    stem = _TIMESTAMP_SUFFIX_RE.sub('', stem)
    stem = _COPY_COUNTER_RE.sub('', stem)
    # Replace spaces and special chars; \w already includes "_", so runs of
    # underscores still have to be collapsed afterwards.
    slug = _NON_WORD_RE.sub('_', stem)
    slug = _UNDERSCORE_RUN_RE.sub('_', slug).strip('_')
    # Truncate
    if len(slug) > MAX_SLUG_LENGTH:
        slug = slug[:MAX_SLUG_LENGTH].rstrip('_')
    return slug or 'untitled'


def extract_pdf_text(pdf_path):
    """Extract text from a PDF and return a list of ``(page_num, text)`` tuples.

    Page numbers are 1-based. Pages whose extracted text is empty or
    whitespace-only are omitted, so the list may be shorter than the
    document. The document is closed even if extraction raises.
    """
    pages = []
    with fitz.open(str(pdf_path)) as doc:
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text()
            if text.strip():
                pages.append((page_num, text))
    return pages


def pages_to_markdown(pages):
    """Convert extracted pages to a single markdown string.

    Each page is prefixed with an ``<!-- Page N -->`` marker and pages are
    separated by a horizontal rule.
    """
    parts = []
    for page_num, text in pages:
        # Clean up common PDF extraction artifacts: collapse 3+ newlines to 2.
        text = _BLANK_RUN_RE.sub('\n\n', text)
        # Drop a short first line (< 10 characters after stripping), treated
        # as a running header or page number. Only the first line is checked;
        # trailing lines are left untouched.
        lines = text.split('\n')
        if lines and len(lines[0].strip()) < 10:
            lines = lines[1:]
        text = '\n'.join(lines)
        parts.append(f"<!-- Page {page_num} -->\n\n{text}")
    return '\n\n---\n\n'.join(parts)


def _yaml_quote(value):
    """Return *value* as a YAML double-quoted scalar with escapes applied."""
    escaped = (
        value.replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
    )
    return f'"{escaped}"'


def write_markdown(output_path, metadata, content, page_count):
    """Write a markdown file with YAML frontmatter.

    Args:
        output_path: Destination path; the file is written as UTF-8 and
            overwritten if it exists.
        metadata: Mapping that may hold any of FRONTMATTER_KEYS. String
            values are written as double-quoted YAML scalars; other values
            (e.g. the integer year) are written as-is. Falsy values are
            skipped.
        content: Markdown body placed after the ``# title`` heading.
        page_count: Number written to the ``page_count`` frontmatter field.
    """
    frontmatter_lines = ['---']
    for key in FRONTMATTER_KEYS:
        val = metadata.get(key)
        if not val:
            continue
        if isinstance(val, str):
            # Always quote strings: titles contain ":" and filenames may hold
            # quotes or backslashes that would otherwise break the YAML.
            val = _yaml_quote(val)
        frontmatter_lines.append(f'{key}: {val}')
    frontmatter_lines.append(f'page_count: {page_count}')
    frontmatter_lines.append('---')
    frontmatter = '\n'.join(frontmatter_lines)

    title = metadata.get('title', 'Untitled')
    header = f"# {title}\n\n"

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(f"{frontmatter}\n\n{header}{content}")


def _unique_slug(slug, used):
    """Return *slug*, or *slug* with a numeric suffix, not yet present in *used*.

    The chosen name is added to *used*.
    """
    candidate = slug
    suffix = 2
    while candidate in used:
        candidate = f"{slug}_{suffix}"
        suffix += 1
    used.add(candidate)
    return candidate


def main():
    """Convert every PDF directly inside BASE_DIR to a markdown file in MD_DIR.

    PDFs are processed in sorted order. A failure on one PDF is reported and
    counted but does not stop the run. The ``page_count`` written for each
    file is the number of pages that yielded text, not the PDF's total.
    """
    extensions = {'.pdf'}
    pdf_files = sorted(
        path for path in BASE_DIR.iterdir()
        if path.suffix.lower() in extensions and path.is_file()
    )

    print(f"Found {len(pdf_files)} PDFs to convert")

    success = 0
    empty = 0
    errors = 0
    # Slugs already handed out in this run. Copy-counter stripping and
    # truncation can map different PDFs to the same slug, which would
    # otherwise silently overwrite an earlier output file.
    used_slugs = set()

    for pdf_path in pdf_files:
        slug = _unique_slug(clean_filename(pdf_path.name), used_slugs)
        output_path = MD_DIR / f"{slug}.md"

        try:
            pages = extract_pdf_text(pdf_path)
            if not pages:
                print(f" EMPTY: {pdf_path.name}")
                empty += 1
                continue

            metadata = find_metadata(pdf_path.name)
            metadata['source'] = pdf_path.name

            content = pages_to_markdown(pages)
            write_markdown(output_path, metadata, content, len(pages))

            word_count = len(content.split())
            print(f" OK: {slug}.md ({len(pages)} pages, {word_count} words)")
            success += 1

        except Exception as e:
            # Top-level boundary: report and keep going with the next PDF.
            print(f" ERROR: {pdf_path.name}: {e}")
            errors += 1

    print(f"\nResults: {success} converted, {empty} empty, {errors} errors")


if __name__ == "__main__":
    main()