Alchemical Hands in the Hypnerotomachia Poliphili

Marginalia, Scholarship & Reception

← All Scripts

Initialize Database

init_db.py — 221 lines

Creates SQLite schema (7 core tables) and catalogs PDFs/documents from the filesystem.

1"""Initialize the Hypnerotomachia Poliphili SQLite database."""
2
3import sqlite3
4import os
5import re
6from pathlib import Path
7
8BASE_DIR = Path(__file__).resolve().parent.parent
9DB_PATH = BASE_DIR / "db" / "hp.db"
10
11SCHEMA = """
12CREATE TABLE IF NOT EXISTS documents (
13    id INTEGER PRIMARY KEY,
14    filename TEXT NOT NULL,
15    title TEXT,
16    author TEXT,
17    year INTEGER,
18    doc_type TEXT CHECK(doc_type IN ('PRIMARY_TEXT','DISSERTATION','SCHOLARSHIP','PRESENTATION')),
19    page_count INTEGER,
20    file_size_bytes INTEGER,
21    has_selectable_text BOOLEAN
22);
23
24CREATE TABLE IF NOT EXISTS manuscripts (
25    id INTEGER PRIMARY KEY,
26    shelfmark TEXT NOT NULL UNIQUE,
27    institution TEXT,
28    city TEXT,
29    description TEXT,
30    image_count INTEGER,
31    image_dir TEXT
32);
33
34CREATE TABLE IF NOT EXISTS images (
35    id INTEGER PRIMARY KEY,
36    manuscript_id INTEGER REFERENCES manuscripts(id),
37    filename TEXT NOT NULL,
38    folio_number TEXT,
39    side TEXT CHECK(side IN ('r','v',NULL)),
40    page_type TEXT CHECK(page_type IN ('PAGE','MARGINALIA_DETAIL','COVER','GUARD','OTHER')),
41    sort_order INTEGER,
42    relative_path TEXT NOT NULL,
43    master_path TEXT,
44    web_path TEXT
45);
46
47CREATE TABLE IF NOT EXISTS signature_map (
48    id INTEGER PRIMARY KEY,
49    signature TEXT NOT NULL UNIQUE,
50    folio_number INTEGER,
51    side TEXT CHECK(side IN ('r','v')),
52    quire TEXT,
53    leaf_in_quire INTEGER
54);
55
56CREATE TABLE IF NOT EXISTS dissertation_refs (
57    id INTEGER PRIMARY KEY,
58    thesis_page INTEGER,
59    signature_ref TEXT,
60    manuscript_shelfmark TEXT,
61    context_text TEXT,
62    marginal_text TEXT,
63    source_text TEXT,
64    ref_type TEXT CHECK(ref_type IN ('MARGINALIA','ILLUSTRATION','TEXT','BINDING','PROVENANCE')),
65    chapter_num INTEGER
66);
67
68CREATE TABLE IF NOT EXISTS matches (
69    id INTEGER PRIMARY KEY,
70    ref_id INTEGER REFERENCES dissertation_refs(id),
71    image_id INTEGER REFERENCES images(id),
72    match_method TEXT CHECK(match_method IN ('SIGNATURE_EXACT','FOLIO_EXACT','MANUAL','VISION_VERIFIED')),
73    confidence TEXT CHECK(confidence IN ('HIGH','MEDIUM','LOW','PROVISIONAL')),
74    needs_review BOOLEAN DEFAULT 1
75);
76
77CREATE TABLE IF NOT EXISTS image_readings (
78    id INTEGER PRIMARY KEY,
79    image_id INTEGER REFERENCES images(id),
80    phase INTEGER NOT NULL,
81    model TEXT NOT NULL,
82    raw_json TEXT NOT NULL,
83    page_number_read INTEGER,
84    page_number_expected INTEGER,
85    page_number_match BOOLEAN,
86    has_woodcut BOOLEAN,
87    woodcut_description TEXT,
88    has_annotations BOOLEAN,
89    annotation_density TEXT CHECK(annotation_density IN ('LIGHT','MODERATE','HEAVY',NULL)),
90    annotation_locations TEXT,
91    languages_detected TEXT,
92    legible_fragments TEXT,
93    deep_reading_json TEXT,
94    concordance_status TEXT CHECK(concordance_status IN (
95        'CONFIRMED', 'DISCREPANCY', 'UNVERIFIED'
96    )) DEFAULT 'UNVERIFIED',
97    reviewed BOOLEAN DEFAULT 0,
98    reviewed_by TEXT,
99    reviewed_at TEXT,
100    promoted_to TEXT,
101    notes TEXT,
102    created_at TEXT DEFAULT (datetime('now'))
103);
104
105CREATE TABLE IF NOT EXISTS schema_version (
106    version INTEGER PRIMARY KEY,
107    created_at TEXT DEFAULT (datetime('now'))
108);
109"""
110
111# Known document metadata extracted from filenames
112DOC_CLASSIFICATIONS = {
113    'PRIMARY_TEXT': [
114        'Francesco Colonna Hypnerotomachia Poliphili Da Capo',
115        'Francesco Colonna Rino Avesani',
116        'Hypnerotomachia by Francesco Colonna',
117    ],
118    'DISSERTATION': [
119        'PhD_Thesis',
120        'E_Thesis_Durham',
121    ],
122    'PRESENTATION': ['.pptx'],
123}
124
125
126def classify_doc(filename):
127    for doc_type, patterns in DOC_CLASSIFICATIONS.items():
128        for pat in patterns:
129            if pat in filename:
130                return doc_type
131    return 'SCHOLARSHIP'
132
133
134def extract_author_title(filename):
135    """Best-effort extraction of author and title from filename."""
136    name = Path(filename).stem
137    # Remove common suffixes
138    name = re.sub(r'\s*\(\d+\)\s*$', '', name)
139    # Try splitting on known patterns
140    parts = name.split(' - ')
141    if len(parts) == 2:
142        return parts[0].strip(), parts[1].strip()
143    # For space-separated names, take first few words as author guess
144    return None, name
145
146
147def populate_documents(conn):
148    """Scan root dir for PDFs and PPTX files."""
149    cur = conn.cursor()
150    extensions = {'.pdf', '.pptx'}
151    count = 0
152    for f in sorted(BASE_DIR.iterdir()):
153        if f.suffix.lower() in extensions and f.is_file():
154            author, title = extract_author_title(f.name)
155            doc_type = classify_doc(f.name)
156            size = f.stat().st_size
157            cur.execute(
158                """INSERT OR IGNORE INTO documents
159                   (filename, title, author, doc_type, file_size_bytes)
160                   VALUES (?, ?, ?, ?, ?)""",
161                (f.name, title, author, doc_type, size)
162            )
163            count += 1
164    conn.commit()
165    print(f"  Cataloged {count} documents")
166
167
168def populate_manuscripts(conn):
169    """Insert the two known manuscript records."""
170    cur = conn.cursor()
171    manuscripts = [
172        (
173            'C.60.o.12',
174            'British Library',
175            'London',
176            'British Library copy of the 1499 Aldine Hypnerotomachia Poliphili with extensive marginalia',
177            '3 BL C.60.o.12 Photos-20260319T001113Z-3-001/3 BL C.60.o.12 Photos'
178        ),
179        (
180            'O.III.38',
181            'Biblioteca degli Intronati',
182            'Siena',
183            'Siena copy of the 1499 Aldine Hypnerotomachia Poliphili, digital facsimile',
184            'Siena O.III.38 Digital Facsimile-20260319T001110Z-3-001/Siena O.III.38 Digital Facsimile'
185        ),
186    ]
187    for shelfmark, institution, city, desc, image_dir in manuscripts:
188        # Count images
189        img_path = BASE_DIR / image_dir
190        img_count = len(list(img_path.glob('*.jpg'))) if img_path.exists() else 0
191        cur.execute(
192            """INSERT OR IGNORE INTO manuscripts
193               (shelfmark, institution, city, description, image_count, image_dir)
194               VALUES (?, ?, ?, ?, ?, ?)""",
195            (shelfmark, institution, city, desc, img_count, image_dir)
196        )
197    conn.commit()
198    print("  Cataloged 2 manuscripts")
199
200
201def main():
202    print(f"Initializing database at {DB_PATH}")
203    conn = sqlite3.connect(DB_PATH)
204
205    print("Creating schema...")
206    conn.executescript(SCHEMA)
207    conn.execute("INSERT OR IGNORE INTO schema_version (version) VALUES (1)")
208    conn.commit()
209
210    print("Populating documents...")
211    populate_documents(conn)
212
213    print("Populating manuscripts...")
214    populate_manuscripts(conn)
215
216    conn.close()
217    print("Done.")
218
219
220if __name__ == "__main__":
221    main()