Initialize Database
Creates SQLite schema (7 core tables) and catalogs PDFs/documents from the filesystem.
1"""Initialize the Hypnerotomachia Poliphili SQLite database.""" 2 3import sqlite3 4import os 5import re 6from pathlib import Path 7 8BASE_DIR = Path(__file__).resolve().parent.parent 9DB_PATH = BASE_DIR / "db" / "hp.db" 10 11SCHEMA = """ 12CREATE TABLE IF NOT EXISTS documents ( 13 id INTEGER PRIMARY KEY, 14 filename TEXT NOT NULL, 15 title TEXT, 16 author TEXT, 17 year INTEGER, 18 doc_type TEXT CHECK(doc_type IN ('PRIMARY_TEXT','DISSERTATION','SCHOLARSHIP','PRESENTATION')), 19 page_count INTEGER, 20 file_size_bytes INTEGER, 21 has_selectable_text BOOLEAN 22); 23 24CREATE TABLE IF NOT EXISTS manuscripts ( 25 id INTEGER PRIMARY KEY, 26 shelfmark TEXT NOT NULL UNIQUE, 27 institution TEXT, 28 city TEXT, 29 description TEXT, 30 image_count INTEGER, 31 image_dir TEXT 32); 33 34CREATE TABLE IF NOT EXISTS images ( 35 id INTEGER PRIMARY KEY, 36 manuscript_id INTEGER REFERENCES manuscripts(id), 37 filename TEXT NOT NULL, 38 folio_number TEXT, 39 side TEXT CHECK(side IN ('r','v',NULL)), 40 page_type TEXT CHECK(page_type IN ('PAGE','MARGINALIA_DETAIL','COVER','GUARD','OTHER')), 41 sort_order INTEGER, 42 relative_path TEXT NOT NULL, 43 master_path TEXT, 44 web_path TEXT 45); 46 47CREATE TABLE IF NOT EXISTS signature_map ( 48 id INTEGER PRIMARY KEY, 49 signature TEXT NOT NULL UNIQUE, 50 folio_number INTEGER, 51 side TEXT CHECK(side IN ('r','v')), 52 quire TEXT, 53 leaf_in_quire INTEGER 54); 55 56CREATE TABLE IF NOT EXISTS dissertation_refs ( 57 id INTEGER PRIMARY KEY, 58 thesis_page INTEGER, 59 signature_ref TEXT, 60 manuscript_shelfmark TEXT, 61 context_text TEXT, 62 marginal_text TEXT, 63 source_text TEXT, 64 ref_type TEXT CHECK(ref_type IN ('MARGINALIA','ILLUSTRATION','TEXT','BINDING','PROVENANCE')), 65 chapter_num INTEGER 66); 67 68CREATE TABLE IF NOT EXISTS matches ( 69 id INTEGER PRIMARY KEY, 70 ref_id INTEGER REFERENCES dissertation_refs(id), 71 image_id INTEGER REFERENCES images(id), 72 match_method TEXT CHECK(match_method IN ('SIGNATURE_EXACT','FOLIO_EXACT','MANUAL','VISION_VERIFIED')), 73 confidence TEXT CHECK(confidence IN ('HIGH','MEDIUM','LOW','PROVISIONAL')), 74 needs_review BOOLEAN DEFAULT 1 75); 76 77CREATE TABLE IF NOT EXISTS image_readings ( 78 id INTEGER PRIMARY KEY, 79 image_id INTEGER REFERENCES images(id), 80 phase INTEGER NOT NULL, 81 model TEXT NOT NULL, 82 raw_json TEXT NOT NULL, 83 page_number_read INTEGER, 84 page_number_expected INTEGER, 85 page_number_match BOOLEAN, 86 has_woodcut BOOLEAN, 87 woodcut_description TEXT, 88 has_annotations BOOLEAN, 89 annotation_density TEXT CHECK(annotation_density IN ('LIGHT','MODERATE','HEAVY',NULL)), 90 annotation_locations TEXT, 91 languages_detected TEXT, 92 legible_fragments TEXT, 93 deep_reading_json TEXT, 94 concordance_status TEXT CHECK(concordance_status IN ( 95 'CONFIRMED', 'DISCREPANCY', 'UNVERIFIED' 96 )) DEFAULT 'UNVERIFIED', 97 reviewed BOOLEAN DEFAULT 0, 98 reviewed_by TEXT, 99 reviewed_at TEXT, 100 promoted_to TEXT, 101 notes TEXT, 102 created_at TEXT DEFAULT (datetime('now')) 103); 104 105CREATE TABLE IF NOT EXISTS schema_version ( 106 version INTEGER PRIMARY KEY, 107 created_at TEXT DEFAULT (datetime('now')) 108); 109""" 110 111# Known document metadata extracted from filenames 112DOC_CLASSIFICATIONS = { 113 'PRIMARY_TEXT': [ 114 'Francesco Colonna Hypnerotomachia Poliphili Da Capo', 115 'Francesco Colonna Rino Avesani', 116 'Hypnerotomachia by Francesco Colonna', 117 ], 118 'DISSERTATION': [ 119 'PhD_Thesis', 120 'E_Thesis_Durham', 121 ], 122 'PRESENTATION': ['.pptx'], 123} 124 125 126def classify_doc(filename): 127 for doc_type, patterns in DOC_CLASSIFICATIONS.items(): 128 for pat in patterns: 129 if pat in filename: 130 return doc_type 131 return 'SCHOLARSHIP' 132 133 134def extract_author_title(filename): 135 """Best-effort extraction of author and title from filename.""" 136 name = Path(filename).stem 137 # Remove common suffixes 138 name = re.sub(r'\s*\(\d+\)\s*$', '', name) 139 # Try splitting on known patterns 140 parts = name.split(' - ') 141 if len(parts) == 2: 142 return parts[0].strip(), parts[1].strip() 143 # For space-separated names, take first few words as author guess 144 return None, name 145 146 147def populate_documents(conn): 148 """Scan root dir for PDFs and PPTX files.""" 149 cur = conn.cursor() 150 extensions = {'.pdf', '.pptx'} 151 count = 0 152 for f in sorted(BASE_DIR.iterdir()): 153 if f.suffix.lower() in extensions and f.is_file(): 154 author, title = extract_author_title(f.name) 155 doc_type = classify_doc(f.name) 156 size = f.stat().st_size 157 cur.execute( 158 """INSERT OR IGNORE INTO documents 159 (filename, title, author, doc_type, file_size_bytes) 160 VALUES (?, ?, ?, ?, ?)""", 161 (f.name, title, author, doc_type, size) 162 ) 163 count += 1 164 conn.commit() 165 print(f" Cataloged {count} documents") 166 167 168def populate_manuscripts(conn): 169 """Insert the two known manuscript records.""" 170 cur = conn.cursor() 171 manuscripts = [ 172 ( 173 'C.60.o.12', 174 'British Library', 175 'London', 176 'British Library copy of the 1499 Aldine Hypnerotomachia Poliphili with extensive marginalia', 177 '3 BL C.60.o.12 Photos-20260319T001113Z-3-001/3 BL C.60.o.12 Photos' 178 ), 179 ( 180 'O.III.38', 181 'Biblioteca degli Intronati', 182 'Siena', 183 'Siena copy of the 1499 Aldine Hypnerotomachia Poliphili, digital facsimile', 184 'Siena O.III.38 Digital Facsimile-20260319T001110Z-3-001/Siena O.III.38 Digital Facsimile' 185 ), 186 ] 187 for shelfmark, institution, city, desc, image_dir in manuscripts: 188 # Count images 189 img_path = BASE_DIR / image_dir 190 img_count = len(list(img_path.glob('*.jpg'))) if img_path.exists() else 0 191 cur.execute( 192 """INSERT OR IGNORE INTO manuscripts 193 (shelfmark, institution, city, description, image_count, image_dir) 194 VALUES (?, ?, ?, ?, ?, ?)""", 195 (shelfmark, institution, city, desc, img_count, image_dir) 196 ) 197 conn.commit() 198 print(" Cataloged 2 manuscripts") 199 200 201def main(): 202 print(f"Initializing database at {DB_PATH}") 203 conn = sqlite3.connect(DB_PATH) 204 205 print("Creating schema...") 206 conn.executescript(SCHEMA) 207 conn.execute("INSERT OR IGNORE INTO schema_version (version) VALUES (1)") 208 conn.commit() 209 210 print("Populating documents...") 211 populate_documents(conn) 212 213 print("Populating manuscripts...") 214 populate_manuscripts(conn) 215 216 conn.close() 217 print("Done.") 218 219 220if __name__ == "__main__": 221 main()