Alchemical Hands in the Hypnerotomachia Poliphili

Marginalia, Scholarship & Reception

← All Scripts

Schema Migration V2

migrate_v2.py — 389 lines

Adds annotations, annotators, doc_folio_refs, dictionary tables, review/provenance columns. Downgrades BL confidence.

1"""Schema migration v2: Harden data model with review/provenance fields,
2normalize tables, add dictionary and annotations support.
3
4Idempotent — safe to run multiple times.
5"""
6
7import sqlite3
8import json
9from pathlib import Path
10
11BASE_DIR = Path(__file__).resolve().parent.parent
12DB_PATH = BASE_DIR / "db" / "hp.db"
13
14MIGRATION_SQL = """
15-- ============================================================
16-- V2 MIGRATION: Hardened schema with review/provenance fields
17-- ============================================================
18
19-- 1. Annotations table: first-class representation of marginal notes
20CREATE TABLE IF NOT EXISTS annotations (
21    id INTEGER PRIMARY KEY,
22    manuscript_id INTEGER REFERENCES manuscripts(id),
23    hand_id INTEGER REFERENCES annotator_hands(id),
24    signature_ref TEXT,
25    folio_number INTEGER,
26    side TEXT CHECK(side IN ('r','v',NULL)),
27    annotation_text TEXT,
28    annotation_type TEXT CHECK(annotation_type IN (
29        'MARGINAL_NOTE','LABEL','SYMBOL','UNDERLINE','DRAWING',
30        'CROSS_REFERENCE','INDEX_ENTRY','PROVENANCE','EMENDATION','OTHER'
31    )),
32    language TEXT,
33    thesis_page INTEGER,
34    thesis_chapter INTEGER,
35    confidence TEXT CHECK(confidence IN ('HIGH','MEDIUM','LOW','PROVISIONAL')) DEFAULT 'PROVISIONAL',
36    needs_review BOOLEAN DEFAULT 1,
37    reviewed BOOLEAN DEFAULT 0,
38    reviewed_by TEXT,
39    reviewed_at TEXT,
40    source_method TEXT CHECK(source_method IN (
41        'MANUAL_TRANSCRIPTION','PDF_EXTRACTION','LLM_ASSISTED','INFERRED'
42    )) DEFAULT 'INFERRED',
43    notes TEXT
44);
45
46-- 2. Rename/normalize annotator_hands → annotators (keep old table, add view)
47CREATE TABLE IF NOT EXISTS annotators (
48    id INTEGER PRIMARY KEY,
49    hand_label TEXT NOT NULL,
50    manuscript_id INTEGER REFERENCES manuscripts(id),
51    manuscript_shelfmark TEXT NOT NULL,
52    attribution TEXT,
53    attribution_confidence TEXT CHECK(attribution_confidence IN ('CERTAIN','PROBABLE','POSSIBLE','UNKNOWN')) DEFAULT 'POSSIBLE',
54    language TEXT,
55    ink_color TEXT,
56    date_range TEXT,
57    school TEXT,
58    interests TEXT,
59    description TEXT,
60    is_alchemist BOOLEAN DEFAULT 0,
61    chapter_num INTEGER,
62    source_method TEXT DEFAULT 'LLM_ASSISTED',
63    needs_review BOOLEAN DEFAULT 1,
64    reviewed BOOLEAN DEFAULT 0,
65    notes TEXT,
66    UNIQUE(hand_label, manuscript_shelfmark)
67);
68
69-- 3. doc_folio_refs: references from any document to specific folios
70CREATE TABLE IF NOT EXISTS doc_folio_refs (
71    id INTEGER PRIMARY KEY,
72    document_id INTEGER REFERENCES documents(id),
73    signature_ref TEXT,
74    folio_number INTEGER,
75    side TEXT CHECK(side IN ('r','v',NULL)),
76    manuscript_shelfmark TEXT,
77    page_in_document INTEGER,
78    context_text TEXT,
79    marginal_text TEXT,
80    ref_type TEXT CHECK(ref_type IN (
81        'MARGINALIA','ILLUSTRATION','TEXT','BINDING','PROVENANCE','CROSS_REF'
82    )),
83    chapter_num INTEGER,
84    hand_id INTEGER REFERENCES annotators(id),
85    confidence TEXT CHECK(confidence IN ('HIGH','MEDIUM','LOW','PROVISIONAL')) DEFAULT 'PROVISIONAL',
86    needs_review BOOLEAN DEFAULT 1,
87    reviewed BOOLEAN DEFAULT 0,
88    source_method TEXT DEFAULT 'PDF_EXTRACTION',
89    notes TEXT
90);
91
92-- 4. Dictionary tables
93CREATE TABLE IF NOT EXISTS dictionary_terms (
94    id INTEGER PRIMARY KEY,
95    slug TEXT NOT NULL UNIQUE,
96    label TEXT NOT NULL,
97    category TEXT NOT NULL,
98    definition_short TEXT NOT NULL,
99    definition_long TEXT,
100    source_basis TEXT,
101    review_status TEXT CHECK(review_status IN ('DRAFT','REVIEWED','VERIFIED')) DEFAULT 'DRAFT',
102    needs_review BOOLEAN DEFAULT 1,
103    reviewed BOOLEAN DEFAULT 0,
104    created_at TEXT DEFAULT (datetime('now')),
105    updated_at TEXT DEFAULT (datetime('now'))
106);
107
108CREATE TABLE IF NOT EXISTS dictionary_term_links (
109    id INTEGER PRIMARY KEY,
110    term_id INTEGER REFERENCES dictionary_terms(id),
111    linked_term_id INTEGER REFERENCES dictionary_terms(id),
112    link_type TEXT CHECK(link_type IN ('RELATED','SEE_ALSO','OPPOSITE','PARENT','CHILD')) DEFAULT 'RELATED',
113    UNIQUE(term_id, linked_term_id, link_type)
114);
115
116-- 5. Topic cluster junction table (multi-value support)
117CREATE TABLE IF NOT EXISTS document_topics (
118    document_id INTEGER,
119    topic TEXT NOT NULL,
120    source_table TEXT NOT NULL CHECK(source_table IN ('bibliography','documents','summaries')),
121    source_id INTEGER NOT NULL,
122    PRIMARY KEY (source_table, source_id, topic)
123);
124
125-- 6. Add review/provenance columns to existing tables where missing
126-- (ALTER TABLE IF NOT EXISTS not supported in SQLite, so we use try/except in Python)
127
128-- 7. Update matches confidence constraint to include PROVISIONAL
129-- (SQLite can't alter constraints, so we handle this in the matching logic)
130
131-- 8. Update schema version
132INSERT OR REPLACE INTO schema_version (version, created_at) VALUES (2, datetime('now'));
133"""
134
135# Columns to add to existing tables (table, column, type, default)
136ALTER_COLUMNS = [
137    ('matches', 'source_method', "TEXT DEFAULT 'FOLIO_EXACT'", None),
138    ('matches', 'reviewed', 'BOOLEAN DEFAULT 0', None),
139    ('matches', 'reviewed_by', 'TEXT', None),
140    ('matches', 'notes', 'TEXT', None),
141    ('documents', 'review_status', "TEXT DEFAULT 'UNREVIEWED'", None),
142    ('documents', 'source_method', "TEXT DEFAULT 'FILESYSTEM_SCAN'", None),
143    ('bibliography', 'review_status', "TEXT DEFAULT 'UNREVIEWED'", None),
144    ('bibliography', 'needs_review', 'BOOLEAN DEFAULT 1', None),
145    ('bibliography', 'reviewed', 'BOOLEAN DEFAULT 0', None),
146    ('scholars', 'review_status', "TEXT DEFAULT 'UNREVIEWED'", None),
147    ('scholars', 'needs_review', 'BOOLEAN DEFAULT 1', None),
148    ('scholars', 'reviewed', 'BOOLEAN DEFAULT 0', None),
149    ('scholars', 'source_method', "TEXT DEFAULT 'LLM_ASSISTED'", None),
150    ('images', 'confidence', "TEXT DEFAULT 'PROVISIONAL'", None),
151    ('images', 'needs_review', 'BOOLEAN DEFAULT 1', None),
152    ('timeline_events', 'needs_review', 'BOOLEAN DEFAULT 1', None),
153    ('timeline_events', 'source_method', "TEXT DEFAULT 'LLM_ASSISTED'", None),
154]
155
156
157def add_column_if_missing(cur, table, column, col_type, default):
158    """Add a column to a table if it doesn't already exist."""
159    cur.execute(f"PRAGMA table_info({table})")
160    existing = {row[1] for row in cur.fetchall()}
161    if column not in existing:
162        sql = f"ALTER TABLE {table} ADD COLUMN {column} {col_type}"
163        cur.execute(sql)
164        return True
165    return False
166
167
168def migrate_annotator_hands(conn):
169    """Copy annotator_hands data into the new annotators table."""
170    cur = conn.cursor()
171    cur.execute("SELECT COUNT(*) FROM annotator_hands")
172    old_count = cur.fetchone()[0]
173    if old_count == 0:
174        return 0
175
176    cur.execute("SELECT COUNT(*) FROM annotators")
177    new_count = cur.fetchone()[0]
178    if new_count > 0:
179        return new_count  # Already migrated
180
181    cur.execute("""
182        INSERT OR IGNORE INTO annotators
183            (hand_label, manuscript_shelfmark, attribution, language,
184             ink_color, date_range, school, interests, description,
185             is_alchemist, chapter_num, source_method, needs_review)
186        SELECT
187            hand_label, manuscript_shelfmark, attribution, language,
188            ink_color, date_range, school, interests, description,
189            is_alchemist, chapter_num, 'LLM_ASSISTED', 1
190        FROM annotator_hands
191    """)
192    conn.commit()
193    cur.execute("SELECT COUNT(*) FROM annotators")
194    return cur.fetchone()[0]
195
196
197def migrate_dissertation_refs(conn):
198    """Copy dissertation_refs into doc_folio_refs with proper provenance."""
199    cur = conn.cursor()
200
201    cur.execute("SELECT COUNT(*) FROM doc_folio_refs")
202    if cur.fetchone()[0] > 0:
203        return  # Already migrated
204
205    # Find the Russell thesis document ID
206    cur.execute("SELECT id FROM documents WHERE filename LIKE '%Russell%' LIMIT 1")
207    row = cur.fetchone()
208    doc_id = row[0] if row else None
209
210    cur.execute("""
211        INSERT INTO doc_folio_refs
212            (document_id, signature_ref, manuscript_shelfmark,
213             page_in_document, context_text, marginal_text,
214             ref_type, chapter_num, hand_id, confidence, source_method)
215        SELECT
216            ?, signature_ref, manuscript_shelfmark,
217            thesis_page, context_text, marginal_text,
218            ref_type, chapter_num, hand_id, 'PROVISIONAL', 'PDF_EXTRACTION'
219        FROM dissertation_refs
220    """, (doc_id,))
221    conn.commit()
222
223
224def downgrade_bl_confidence(conn):
225    """Downgrade all BL C.60.o.12 matches to LOW confidence.
226
227    Rationale: BL copy is 1545 edition; our signature map is based on 1499.
228    Photo numbers assumed to equal folio numbers without manual verification.
229    """
230    cur = conn.cursor()
231
232    # Downgrade matches involving BL images
233    cur.execute("""
234        UPDATE matches SET confidence = 'LOW', needs_review = 1
235        WHERE image_id IN (
236            SELECT i.id FROM images i
237            JOIN manuscripts m ON i.manuscript_id = m.id
238            WHERE m.shelfmark = 'C.60.o.12'
239        ) AND confidence != 'LOW'
240    """)
241    bl_downgraded = cur.rowcount
242
243    # Also update doc_folio_refs for BL
244    cur.execute("""
245        UPDATE doc_folio_refs SET confidence = 'PROVISIONAL'
246        WHERE manuscript_shelfmark = 'C.60.o.12'
247    """)
248
249    conn.commit()
250    return bl_downgraded
251
252
253def import_summaries_to_db(conn):
254    """Import scholars/summaries.json into bibliography and scholars tables
255    with proper provenance tracking."""
256    summaries_path = BASE_DIR / "scholars" / "summaries.json"
257    if not summaries_path.exists():
258        print("  No summaries.json found, skipping")
259        return
260
261    with open(summaries_path, encoding='utf-8') as f:
262        summaries = json.load(f)
263
264    cur = conn.cursor()
265
266    for s in summaries:
267        author = s.get('author', 'Unknown')
268        title = s.get('title', '')
269        year = s.get('year')
270        journal = s.get('journal', '')
271        summary = s.get('summary', '')
272        topic = s.get('topic_cluster', '')
273        filename = s.get('filename', '')
274
275        # Determine pub_type
276        if 'PhD' in journal or 'Thesis' in journal or 'E-Thesis' in journal:
277            pub_type = 'thesis'
278        elif 'Press' in journal or '(' not in journal:
279            pub_type = 'book'
280        else:
281            pub_type = 'article'
282
283        # Upsert into bibliography
284        cur.execute("""
285            INSERT OR IGNORE INTO bibliography
286                (author, title, year, pub_type, journal_or_publisher,
287                 in_collection, collection_filename, hp_relevance,
288                 topic_cluster, notes, review_status, needs_review)
289            VALUES (?, ?, ?, ?, ?, 1, ?, 'DIRECT', ?, ?, 'UNREVIEWED', 1)
290        """, (author, title, str(year) if year else None, pub_type, journal,
291              filename, topic, f'LLM-generated summary: {summary[:200]}...'))
292
293        # Upsert into scholars
294        cur.execute("""
295            INSERT OR IGNORE INTO scholars
296                (name, source_method, needs_review, review_status)
297            VALUES (?, 'LLM_ASSISTED', 1, 'UNREVIEWED')
298        """, (author,))
299
300        # Add topic to junction table
301        if topic:
302            cur.execute("SELECT id FROM bibliography WHERE author=? AND title=?", (author, title))
303            bib_row = cur.fetchone()
304            if bib_row:
305                for t in topic.split(','):
306                    t = t.strip()
307                    if t:
308                        cur.execute("""
309                            INSERT OR IGNORE INTO document_topics
310                                (source_table, source_id, topic)
311                            VALUES ('bibliography', ?, ?)
312                        """, (bib_row[0], t))
313
314    conn.commit()
315    print(f"  Imported {len(summaries)} summaries into bibliography/scholars")
316
317
318def main():
319    conn = sqlite3.connect(DB_PATH)
320    cur = conn.cursor()
321
322    print("=== Schema Migration V2 ===\n")
323
324    # Step 1: Create new tables
325    print("1. Creating new tables...")
326    cur.executescript(MIGRATION_SQL)
327    conn.commit()
328    print("   Done (annotations, annotators, doc_folio_refs, dictionary_terms,")
329    print("         dictionary_term_links, document_topics)")
330
331    # Step 2: Add columns to existing tables
332    print("\n2. Adding review/provenance columns to existing tables...")
333    added = 0
334    for table, column, col_type, default in ALTER_COLUMNS:
335        if add_column_if_missing(cur, table, column, col_type, default):
336            print(f"   + {table}.{column}")
337            added += 1
338    conn.commit()
339    print(f"   Added {added} new columns")
340
341    # Step 3: Migrate annotator_hands → annotators
342    print("\n3. Migrating annotator_hands -> annotators...")
343    n = migrate_annotator_hands(conn)
344    print(f"   {n} annotators in normalized table")
345
346    # Step 4: Migrate dissertation_refs → doc_folio_refs
347    print("\n4. Migrating dissertation_refs -> doc_folio_refs...")
348    migrate_dissertation_refs(conn)
349    cur.execute("SELECT COUNT(*) FROM doc_folio_refs")
350    print(f"   {cur.fetchone()[0]} folio references migrated")
351
352    # Step 5: Downgrade BL confidence
353    print("\n5. Downgrading BL C.60.o.12 matches to LOW confidence...")
354    n = downgrade_bl_confidence(conn)
355    print(f"   {n} matches downgraded")
356
357    # Step 6: Import summaries.json with provenance
358    print("\n6. Importing summaries.json into DB with provenance tracking...")
359    import_summaries_to_db(conn)
360
361    # Step 7: Summary
362    print("\n=== Migration Summary ===")
363    tables = [
364        'documents', 'manuscripts', 'images', 'signature_map',
365        'dissertation_refs', 'doc_folio_refs', 'matches',
366        'annotator_hands', 'annotators', 'annotations',
367        'bibliography', 'scholars', 'scholar_works',
368        'timeline_events', 'dictionary_terms', 'dictionary_term_links',
369        'document_topics', 'schema_version'
370    ]
371    for t in tables:
372        try:
373            cur.execute(f"SELECT COUNT(*) FROM {t}")
374            print(f"  {t}: {cur.fetchone()[0]} rows")
375        except sqlite3.OperationalError:
376            print(f"  {t}: (not found)")
377
378    # Confidence distribution
379    print("\nMatch confidence distribution:")
380    cur.execute("SELECT confidence, COUNT(*) FROM matches GROUP BY confidence ORDER BY confidence")
381    for row in cur.fetchall():
382        print(f"  {row[0]}: {row[1]}")
383
384    conn.close()
385    print("\nMigration complete.")
386
387
388if __name__ == "__main__":
389    main()