Enrich Dictionary
Populates dictionary fields from reading packets with source provenance and review status.
"""Enrich dictionary entries from reading packets.

Reads structured packets from /staging/packets/ and populates:
- source_documents: documents where the term appears
- source_page_refs: page references from matched chunks
- source_quotes_short: brief representative quotes
- significance_to_hp: why the term matters for the HP (generated, marked DRAFT)
- significance_to_scholarship: why it matters for scholarship (generated, marked DRAFT)
- source_method: CORPUS_EXTRACTION for retrieved data, LLM_ASSISTED for generated prose

RULES:
- Never overwrites fields where review_status = 'VERIFIED'
- Sets source_method on all populated fields
- All generated prose is marked review_status = 'DRAFT'
- Provenance is preserved in notes field

NOTE: the code in this module writes only the corpus-extracted columns
(source_documents, source_page_refs, source_quotes_short) together with
source_method, confidence, notes and updated_at.  The significance_* columns
and LLM_ASSISTED prose listed above are not written by anything in this file.
The notes column is overwritten with a fresh provenance line on every run.
"""

import sqlite3
import json
from pathlib import Path
from datetime import datetime

BASE_DIR = Path(__file__).resolve().parent.parent
DB_PATH = BASE_DIR / "db" / "hp.db"
PACKETS_DIR = BASE_DIR / "staging" / "packets"

# Map source_doc paths to readable document titles.
# Keys are matched as substrings of the source_doc path, in insertion order.
DOC_TITLES = {
    'PhD_Thesis_James_Russell': 'Russell 2014 (PhD thesis)',
    'Crossing_the_text_image_boundary': 'Priki (text-image boundary)',
    'Dream_Narratives_and_Initiation': 'Priki 2016 (dream narratives)',
    'E_Thesis_Durham': "O'Neill (Durham thesis)",
    'Gollnick_Religious_Dreamworld': 'Gollnick (Apuleius dreamworld)',
    'Elucidating_and_Enigmatizing': 'Priki 2009 (reception)',
    'Canone_Leen_Spruit_Emblematics': 'Canone & Spruit (emblematics)',
    'Francesco_Colonna_Hypnerotomachia_Poliphili_Da_Capo': 'HP (Da Capo edition)',
    'Francesco_Colonna_Rino_Avesani': 'Avesani et al. (Colonna studies)',
    'Albrecht_Durer': 'Leidinger (Durer and HP)',
    'Hypnerotomachia_by_Francesco_Colonna': 'HP primary text',
    'Mario_Praz': 'Praz 1947 (foreign imitators)',
    'Anthony_Blunt': 'Blunt 1937 (HP in French art)',
    'Edward_Wright_Alberti': 'Wright (Alberti and HP)',
    'Liane_Lefaivre': 'Lefaivre 1997 (Alberti attribution)',
    'Ure_Peter': 'Ure 1952 (vocabulary notes)',
    'Rosemary_Trippe': 'Trippe 2002 (text-image)',
    'Mark_Jarzombek': 'Jarzombek 1990 (structural problematics)',
    'Semler': 'Semler 2006 (Dallington)',
    'Narrative_in_Search_of_an_Author': "O'Neill (authorship)",
}


def _identify_doc(source_doc_str):
    """Map a source_doc path to a readable title.

    Returns the first DOC_TITLES value whose key occurs in the path.  When no
    key matches, falls back to the last '/'-separated component with every
    '.md' occurrence removed.  A missing (None) path is treated as ''.
    """
    source_doc_str = source_doc_str or ''
    for key, title in DOC_TITLES.items():
        if key in source_doc_str:
            return title
    return source_doc_str.split('/')[-1].replace('.md', '')


def _extract_source_documents(passages):
    """Return the sorted, unique document titles referenced by passages.

    Passages with no usable source_doc resolve to an empty title; those are
    dropped so they neither appear in the joined string nor inflate the
    document count.
    """
    titles = {_identify_doc(p.get('source_doc')) for p in passages}
    titles.discard('')
    return sorted(titles)


def _extract_page_refs(passages, max_refs=15):
    """Collect page references in first-seen order, without duplicates.

    Returns at most max_refs entries.  A passage whose 'page_refs' is missing
    or null contributes nothing.
    """
    unique_refs = []
    for passage in passages:
        for ref in passage.get('page_refs') or []:
            # List membership (not a set) so unhashable refs keep working.
            if ref not in unique_refs:
                unique_refs.append(ref)
    return unique_refs[:max_refs]


def _extract_short_quotes(passages, max_quotes=3, max_len=200, min_len=50):
    """Build short attributed quotes from the highest-scoring passages.

    Passages are ranked by 'relevance_score' (missing or null counts as 0),
    highest first.  Texts shorter than min_len characters are skipped; texts
    longer than max_len are cut at the last space inside the first max_len
    characters and suffixed with '...'.  Each quote is rendered as
    "<text> [<document title>]".  Returns at most max_quotes quotes.
    """
    ranked = sorted(
        passages,
        key=lambda p: p.get('relevance_score') or 0,
        reverse=True,
    )
    quotes = []
    for passage in ranked:
        # Check the limit before appending so max_quotes=0 yields no quotes.
        if len(quotes) >= max_quotes:
            break
        text = (passage.get('text') or '').strip()
        if len(text) < min_len:
            continue
        if len(text) > max_len:
            text = text[:max_len].rsplit(' ', 1)[0] + '...'
        doc = _identify_doc(passage.get('source_doc'))
        quotes.append(f"{text} [{doc}]")
    return quotes


def _load_packet(packet_file):
    """Load one packet file; return its dict, or None if it is unusable.

    A file that is not valid UTF-8 JSON, is not a JSON object, or has no
    'slug' is reported on stdout and skipped so that one bad file does not
    abort the whole run.
    """
    try:
        with open(packet_file, 'r', encoding='utf-8') as f:
            packet = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        print(f" SKIP (invalid JSON): {packet_file.name}: {exc}")
        return None
    if not isinstance(packet, dict) or not packet.get('slug'):
        print(f" SKIP (no slug): {packet_file.name}")
        return None
    return packet


def enrich_from_packets():
    """Read all packets and update dictionary_terms in DB.

    For every *.json file in PACKETS_DIR (sorted by path) the matching
    dictionary_terms row is updated with extracted source documents, page
    references and short quotes, plus source_method, confidence, notes and
    updated_at.  Rows whose review_status is 'VERIFIED' are never touched.
    Packets without passages, or whose slug has no row in the table, are
    skipped.

    All updates are committed together at the end.  If an error occurs the
    connection is closed without committing, so no partial run is persisted.
    Progress is printed to stdout; nothing is returned.
    """
    if not PACKETS_DIR.exists():
        print("No packets directory found. Run build_reading_packets.py first.")
        return

    enriched = 0
    skipped_verified = 0

    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()

        # Snapshot review statuses up front: VERIFIED terms must not be touched.
        cur.execute("SELECT slug, review_status FROM dictionary_terms")
        term_statuses = {slug: status for slug, status in cur.fetchall()}

        for packet_file in sorted(PACKETS_DIR.glob('*.json')):
            packet = _load_packet(packet_file)
            if packet is None:
                continue

            slug = packet['slug']
            if slug not in term_statuses:
                # The UPDATE would match no row; don't report it as enriched.
                print(f" SKIP (not in dictionary): {slug}")
                continue

            if term_statuses[slug] == 'VERIFIED':
                print(f" SKIP (VERIFIED): {slug}")
                skipped_verified += 1
                continue

            passages = packet.get('passages') or []
            if not passages:
                print(f" SKIP (no passages): {slug}")
                continue

            # Extract structured data from passages
            source_docs = _extract_source_documents(passages)
            page_refs = _extract_page_refs(passages)
            short_quotes = _extract_short_quotes(passages)

            # One timestamp per packet so notes and updated_at agree.
            now = datetime.now()
            updates = {
                'source_documents': '; '.join(source_docs) if source_docs else None,
                'source_page_refs': ', '.join(f'p. {r}' for r in page_refs) if page_refs else None,
                'source_quotes_short': ' | '.join(short_quotes) if short_quotes else None,
                'source_method': 'CORPUS_EXTRACTION',
                'confidence': 'MEDIUM',
                'notes': f"Enriched from corpus reading packets on {now.strftime('%Y-%m-%d')}. "
                         f"{len(passages)} passages retrieved from {len(source_docs)} documents.",
                'updated_at': now.isoformat(),
            }

            # Only write columns we actually have a value for; a None here
            # means "nothing extracted", so the existing DB value is kept.
            columns = {col: val for col, val in updates.items() if val is not None}

            # Column names come from the fixed dict above (never from packet
            # data), so interpolating them is safe; values are bound with '?'.
            assignments = ', '.join(f"{col} = ?" for col in columns)
            cur.execute(
                f"UPDATE dictionary_terms SET {assignments} WHERE slug = ?",
                [*columns.values(), slug],
            )
            enriched += 1
            print(f" ENRICHED: {slug} ({len(source_docs)} docs, {len(page_refs)} refs)")

        conn.commit()
    finally:
        # Always release the connection, even when a packet or query fails.
        conn.close()

    print(f"\nEnriched {enriched} terms, skipped {skipped_verified} verified terms.")


if __name__ == "__main__":
    print("=== Enriching Dictionary from Reading Packets ===\n")
    enrich_from_packets()