Schema Migration V2
Adds annotations, annotators, doc_folio_refs, dictionary tables, review/provenance columns. Downgrades BL confidence.
1"""Schema migration v2: Harden data model with review/provenance fields, 2normalize tables, add dictionary and annotations support. 3 4Idempotent — safe to run multiple times. 5""" 6 7import sqlite3 8import json 9from pathlib import Path 10 11BASE_DIR = Path(__file__).resolve().parent.parent 12DB_PATH = BASE_DIR / "db" / "hp.db" 13 14MIGRATION_SQL = """ 15-- ============================================================ 16-- V2 MIGRATION: Hardened schema with review/provenance fields 17-- ============================================================ 18 19-- 1. Annotations table: first-class representation of marginal notes 20CREATE TABLE IF NOT EXISTS annotations ( 21 id INTEGER PRIMARY KEY, 22 manuscript_id INTEGER REFERENCES manuscripts(id), 23 hand_id INTEGER REFERENCES annotator_hands(id), 24 signature_ref TEXT, 25 folio_number INTEGER, 26 side TEXT CHECK(side IN ('r','v',NULL)), 27 annotation_text TEXT, 28 annotation_type TEXT CHECK(annotation_type IN ( 29 'MARGINAL_NOTE','LABEL','SYMBOL','UNDERLINE','DRAWING', 30 'CROSS_REFERENCE','INDEX_ENTRY','PROVENANCE','EMENDATION','OTHER' 31 )), 32 language TEXT, 33 thesis_page INTEGER, 34 thesis_chapter INTEGER, 35 confidence TEXT CHECK(confidence IN ('HIGH','MEDIUM','LOW','PROVISIONAL')) DEFAULT 'PROVISIONAL', 36 needs_review BOOLEAN DEFAULT 1, 37 reviewed BOOLEAN DEFAULT 0, 38 reviewed_by TEXT, 39 reviewed_at TEXT, 40 source_method TEXT CHECK(source_method IN ( 41 'MANUAL_TRANSCRIPTION','PDF_EXTRACTION','LLM_ASSISTED','INFERRED' 42 )) DEFAULT 'INFERRED', 43 notes TEXT 44); 45 46-- 2. Rename/normalize annotator_hands → annotators (keep old table, add view) 47CREATE TABLE IF NOT EXISTS annotators ( 48 id INTEGER PRIMARY KEY, 49 hand_label TEXT NOT NULL, 50 manuscript_id INTEGER REFERENCES manuscripts(id), 51 manuscript_shelfmark TEXT NOT NULL, 52 attribution TEXT, 53 attribution_confidence TEXT CHECK(attribution_confidence IN ('CERTAIN','PROBABLE','POSSIBLE','UNKNOWN')) DEFAULT 'POSSIBLE', 54 language TEXT, 55 ink_color TEXT, 56 date_range TEXT, 57 school TEXT, 58 interests TEXT, 59 description TEXT, 60 is_alchemist BOOLEAN DEFAULT 0, 61 chapter_num INTEGER, 62 source_method TEXT DEFAULT 'LLM_ASSISTED', 63 needs_review BOOLEAN DEFAULT 1, 64 reviewed BOOLEAN DEFAULT 0, 65 notes TEXT, 66 UNIQUE(hand_label, manuscript_shelfmark) 67); 68 69-- 3. doc_folio_refs: references from any document to specific folios 70CREATE TABLE IF NOT EXISTS doc_folio_refs ( 71 id INTEGER PRIMARY KEY, 72 document_id INTEGER REFERENCES documents(id), 73 signature_ref TEXT, 74 folio_number INTEGER, 75 side TEXT CHECK(side IN ('r','v',NULL)), 76 manuscript_shelfmark TEXT, 77 page_in_document INTEGER, 78 context_text TEXT, 79 marginal_text TEXT, 80 ref_type TEXT CHECK(ref_type IN ( 81 'MARGINALIA','ILLUSTRATION','TEXT','BINDING','PROVENANCE','CROSS_REF' 82 )), 83 chapter_num INTEGER, 84 hand_id INTEGER REFERENCES annotators(id), 85 confidence TEXT CHECK(confidence IN ('HIGH','MEDIUM','LOW','PROVISIONAL')) DEFAULT 'PROVISIONAL', 86 needs_review BOOLEAN DEFAULT 1, 87 reviewed BOOLEAN DEFAULT 0, 88 source_method TEXT DEFAULT 'PDF_EXTRACTION', 89 notes TEXT 90); 91 92-- 4. Dictionary tables 93CREATE TABLE IF NOT EXISTS dictionary_terms ( 94 id INTEGER PRIMARY KEY, 95 slug TEXT NOT NULL UNIQUE, 96 label TEXT NOT NULL, 97 category TEXT NOT NULL, 98 definition_short TEXT NOT NULL, 99 definition_long TEXT, 100 source_basis TEXT, 101 review_status TEXT CHECK(review_status IN ('DRAFT','REVIEWED','VERIFIED')) DEFAULT 'DRAFT', 102 needs_review BOOLEAN DEFAULT 1, 103 reviewed BOOLEAN DEFAULT 0, 104 created_at TEXT DEFAULT (datetime('now')), 105 updated_at TEXT DEFAULT (datetime('now')) 106); 107 108CREATE TABLE IF NOT EXISTS dictionary_term_links ( 109 id INTEGER PRIMARY KEY, 110 term_id INTEGER REFERENCES dictionary_terms(id), 111 linked_term_id INTEGER REFERENCES dictionary_terms(id), 112 link_type TEXT CHECK(link_type IN ('RELATED','SEE_ALSO','OPPOSITE','PARENT','CHILD')) DEFAULT 'RELATED', 113 UNIQUE(term_id, linked_term_id, link_type) 114); 115 116-- 5. Topic cluster junction table (multi-value support) 117CREATE TABLE IF NOT EXISTS document_topics ( 118 document_id INTEGER, 119 topic TEXT NOT NULL, 120 source_table TEXT NOT NULL CHECK(source_table IN ('bibliography','documents','summaries')), 121 source_id INTEGER NOT NULL, 122 PRIMARY KEY (source_table, source_id, topic) 123); 124 125-- 6. Add review/provenance columns to existing tables where missing 126-- (ALTER TABLE IF NOT EXISTS not supported in SQLite, so we use try/except in Python) 127 128-- 7. Update matches confidence constraint to include PROVISIONAL 129-- (SQLite can't alter constraints, so we handle this in the matching logic) 130 131-- 8. Update schema version 132INSERT OR REPLACE INTO schema_version (version, created_at) VALUES (2, datetime('now')); 133""" 134 135# Columns to add to existing tables (table, column, type, default) 136ALTER_COLUMNS = [ 137 ('matches', 'source_method', "TEXT DEFAULT 'FOLIO_EXACT'", None), 138 ('matches', 'reviewed', 'BOOLEAN DEFAULT 0', None), 139 ('matches', 'reviewed_by', 'TEXT', None), 140 ('matches', 'notes', 'TEXT', None), 141 ('documents', 'review_status', "TEXT DEFAULT 'UNREVIEWED'", None), 142 ('documents', 'source_method', "TEXT DEFAULT 'FILESYSTEM_SCAN'", None), 143 ('bibliography', 'review_status', "TEXT DEFAULT 'UNREVIEWED'", None), 144 ('bibliography', 'needs_review', 'BOOLEAN DEFAULT 1', None), 145 ('bibliography', 'reviewed', 'BOOLEAN DEFAULT 0', None), 146 ('scholars', 'review_status', "TEXT DEFAULT 'UNREVIEWED'", None), 147 ('scholars', 'needs_review', 'BOOLEAN DEFAULT 1', None), 148 ('scholars', 'reviewed', 'BOOLEAN DEFAULT 0', None), 149 ('scholars', 'source_method', "TEXT DEFAULT 'LLM_ASSISTED'", None), 150 ('images', 'confidence', "TEXT DEFAULT 'PROVISIONAL'", None), 151 ('images', 'needs_review', 'BOOLEAN DEFAULT 1', None), 152 ('timeline_events', 'needs_review', 'BOOLEAN DEFAULT 1', None), 153 ('timeline_events', 'source_method', "TEXT DEFAULT 'LLM_ASSISTED'", None), 154] 155 156 157def add_column_if_missing(cur, table, column, col_type, default): 158 """Add a column to a table if it doesn't already exist.""" 159 cur.execute(f"PRAGMA table_info({table})") 160 existing = {row[1] for row in cur.fetchall()} 161 if column not in existing: 162 sql = f"ALTER TABLE {table} ADD COLUMN {column} {col_type}" 163 cur.execute(sql) 164 return True 165 return False 166 167 168def migrate_annotator_hands(conn): 169 """Copy annotator_hands data into the new annotators table.""" 170 cur = conn.cursor() 171 cur.execute("SELECT COUNT(*) FROM annotator_hands") 172 old_count = cur.fetchone()[0] 173 if old_count == 0: 174 return 0 175 176 cur.execute("SELECT COUNT(*) FROM annotators") 177 new_count = cur.fetchone()[0] 178 if new_count > 0: 179 return new_count # Already migrated 180 181 cur.execute(""" 182 INSERT OR IGNORE INTO annotators 183 (hand_label, manuscript_shelfmark, attribution, language, 184 ink_color, date_range, school, interests, description, 185 is_alchemist, chapter_num, source_method, needs_review) 186 SELECT 187 hand_label, manuscript_shelfmark, attribution, language, 188 ink_color, date_range, school, interests, description, 189 is_alchemist, chapter_num, 'LLM_ASSISTED', 1 190 FROM annotator_hands 191 """) 192 conn.commit() 193 cur.execute("SELECT COUNT(*) FROM annotators") 194 return cur.fetchone()[0] 195 196 197def migrate_dissertation_refs(conn): 198 """Copy dissertation_refs into doc_folio_refs with proper provenance.""" 199 cur = conn.cursor() 200 201 cur.execute("SELECT COUNT(*) FROM doc_folio_refs") 202 if cur.fetchone()[0] > 0: 203 return # Already migrated 204 205 # Find the Russell thesis document ID 206 cur.execute("SELECT id FROM documents WHERE filename LIKE '%Russell%' LIMIT 1") 207 row = cur.fetchone() 208 doc_id = row[0] if row else None 209 210 cur.execute(""" 211 INSERT INTO doc_folio_refs 212 (document_id, signature_ref, manuscript_shelfmark, 213 page_in_document, context_text, marginal_text, 214 ref_type, chapter_num, hand_id, confidence, source_method) 215 SELECT 216 ?, signature_ref, manuscript_shelfmark, 217 thesis_page, context_text, marginal_text, 218 ref_type, chapter_num, hand_id, 'PROVISIONAL', 'PDF_EXTRACTION' 219 FROM dissertation_refs 220 """, (doc_id,)) 221 conn.commit() 222 223 224def downgrade_bl_confidence(conn): 225 """Downgrade all BL C.60.o.12 matches to LOW confidence. 226 227 Rationale: BL copy is 1545 edition; our signature map is based on 1499. 228 Photo numbers assumed to equal folio numbers without manual verification. 229 """ 230 cur = conn.cursor() 231 232 # Downgrade matches involving BL images 233 cur.execute(""" 234 UPDATE matches SET confidence = 'LOW', needs_review = 1 235 WHERE image_id IN ( 236 SELECT i.id FROM images i 237 JOIN manuscripts m ON i.manuscript_id = m.id 238 WHERE m.shelfmark = 'C.60.o.12' 239 ) AND confidence != 'LOW' 240 """) 241 bl_downgraded = cur.rowcount 242 243 # Also update doc_folio_refs for BL 244 cur.execute(""" 245 UPDATE doc_folio_refs SET confidence = 'PROVISIONAL' 246 WHERE manuscript_shelfmark = 'C.60.o.12' 247 """) 248 249 conn.commit() 250 return bl_downgraded 251 252 253def import_summaries_to_db(conn): 254 """Import scholars/summaries.json into bibliography and scholars tables 255 with proper provenance tracking.""" 256 summaries_path = BASE_DIR / "scholars" / "summaries.json" 257 if not summaries_path.exists(): 258 print(" No summaries.json found, skipping") 259 return 260 261 with open(summaries_path, encoding='utf-8') as f: 262 summaries = json.load(f) 263 264 cur = conn.cursor() 265 266 for s in summaries: 267 author = s.get('author', 'Unknown') 268 title = s.get('title', '') 269 year = s.get('year') 270 journal = s.get('journal', '') 271 summary = s.get('summary', '') 272 topic = s.get('topic_cluster', '') 273 filename = s.get('filename', '') 274 275 # Determine pub_type 276 if 'PhD' in journal or 'Thesis' in journal or 'E-Thesis' in journal: 277 pub_type = 'thesis' 278 elif 'Press' in journal or '(' not in journal: 279 pub_type = 'book' 280 else: 281 pub_type = 'article' 282 283 # Upsert into bibliography 284 cur.execute(""" 285 INSERT OR IGNORE INTO bibliography 286 (author, title, year, pub_type, journal_or_publisher, 287 in_collection, collection_filename, hp_relevance, 288 topic_cluster, notes, review_status, needs_review) 289 VALUES (?, ?, ?, ?, ?, 1, ?, 'DIRECT', ?, ?, 'UNREVIEWED', 1) 290 """, (author, title, str(year) if year else None, pub_type, journal, 291 filename, topic, f'LLM-generated summary: {summary[:200]}...')) 292 293 # Upsert into scholars 294 cur.execute(""" 295 INSERT OR IGNORE INTO scholars 296 (name, source_method, needs_review, review_status) 297 VALUES (?, 'LLM_ASSISTED', 1, 'UNREVIEWED') 298 """, (author,)) 299 300 # Add topic to junction table 301 if topic: 302 cur.execute("SELECT id FROM bibliography WHERE author=? AND title=?", (author, title)) 303 bib_row = cur.fetchone() 304 if bib_row: 305 for t in topic.split(','): 306 t = t.strip() 307 if t: 308 cur.execute(""" 309 INSERT OR IGNORE INTO document_topics 310 (source_table, source_id, topic) 311 VALUES ('bibliography', ?, ?) 312 """, (bib_row[0], t)) 313 314 conn.commit() 315 print(f" Imported {len(summaries)} summaries into bibliography/scholars") 316 317 318def main(): 319 conn = sqlite3.connect(DB_PATH) 320 cur = conn.cursor() 321 322 print("=== Schema Migration V2 ===\n") 323 324 # Step 1: Create new tables 325 print("1. Creating new tables...") 326 cur.executescript(MIGRATION_SQL) 327 conn.commit() 328 print(" Done (annotations, annotators, doc_folio_refs, dictionary_terms,") 329 print(" dictionary_term_links, document_topics)") 330 331 # Step 2: Add columns to existing tables 332 print("\n2. Adding review/provenance columns to existing tables...") 333 added = 0 334 for table, column, col_type, default in ALTER_COLUMNS: 335 if add_column_if_missing(cur, table, column, col_type, default): 336 print(f" + {table}.{column}") 337 added += 1 338 conn.commit() 339 print(f" Added {added} new columns") 340 341 # Step 3: Migrate annotator_hands → annotators 342 print("\n3. Migrating annotator_hands -> annotators...") 343 n = migrate_annotator_hands(conn) 344 print(f" {n} annotators in normalized table") 345 346 # Step 4: Migrate dissertation_refs → doc_folio_refs 347 print("\n4. Migrating dissertation_refs -> doc_folio_refs...") 348 migrate_dissertation_refs(conn) 349 cur.execute("SELECT COUNT(*) FROM doc_folio_refs") 350 print(f" {cur.fetchone()[0]} folio references migrated") 351 352 # Step 5: Downgrade BL confidence 353 print("\n5. Downgrading BL C.60.o.12 matches to LOW confidence...") 354 n = downgrade_bl_confidence(conn) 355 print(f" {n} matches downgraded") 356 357 # Step 6: Import summaries.json with provenance 358 print("\n6. Importing summaries.json into DB with provenance tracking...") 359 import_summaries_to_db(conn) 360 361 # Step 7: Summary 362 print("\n=== Migration Summary ===") 363 tables = [ 364 'documents', 'manuscripts', 'images', 'signature_map', 365 'dissertation_refs', 'doc_folio_refs', 'matches', 366 'annotator_hands', 'annotators', 'annotations', 367 'bibliography', 'scholars', 'scholar_works', 368 'timeline_events', 'dictionary_terms', 'dictionary_term_links', 369 'document_topics', 'schema_version' 370 ] 371 for t in tables: 372 try: 373 cur.execute(f"SELECT COUNT(*) FROM {t}") 374 print(f" {t}: {cur.fetchone()[0]} rows") 375 except sqlite3.OperationalError: 376 print(f" {t}: (not found)") 377 378 # Confidence distribution 379 print("\nMatch confidence distribution:") 380 cur.execute("SELECT confidence, COUNT(*) FROM matches GROUP BY confidence ORDER BY confidence") 381 for row in cur.fetchall(): 382 print(f" {row[0]}: {row[1]}") 383 384 conn.close() 385 print("\nMigration complete.") 386 387 388if __name__ == "__main__": 389 main()