Enrich Dictionary

enrich_dictionary.py — 170 lines
Populates dictionary fields from reading packets with source provenance and review status.
"""Enrich dictionary entries from reading packets.

Reads structured packets from /staging/packets/ and populates:
- source_documents: documents where the term appears
- source_page_refs: page references from matched chunks
- source_quotes_short: brief representative quotes
- significance_to_hp: why the term matters for the HP (generated, marked DRAFT)
- significance_to_scholarship: why it matters for scholarship (generated, marked DRAFT)
- source_method: CORPUS_EXTRACTION for retrieved data, LLM_ASSISTED for generated prose

RULES:
- Never overwrites fields where review_status = 'VERIFIED'
- Sets source_method on all populated fields
- All generated prose is marked review_status = 'DRAFT'
- Provenance is preserved in notes field
"""
17
18import sqlite3
19import json
20from pathlib import Path
21from datetime import datetime
22
# Project root: the parent of the directory that contains this file.
BASE_DIR = Path(__file__).resolve().parent.parent
# SQLite database file; enrich_from_packets() updates its dictionary_terms table.
DB_PATH = BASE_DIR / "db" / "hp.db"
# Directory scanned for *.json reading packets.
PACKETS_DIR = BASE_DIR / "staging" / "packets"

# Map source_doc paths to readable document titles.
# Each key is a substring that _identify_doc() looks for inside a source_doc
# path. Lookup walks the dict in insertion order and the first key found wins,
# so the order of entries is significant if two keys could match one path.
DOC_TITLES = {
    'PhD_Thesis_James_Russell': 'Russell 2014 (PhD thesis)',
    'Crossing_the_text_image_boundary': 'Priki (text-image boundary)',
    'Dream_Narratives_and_Initiation': 'Priki 2016 (dream narratives)',
    'E_Thesis_Durham': "O'Neill (Durham thesis)",
    'Gollnick_Religious_Dreamworld': 'Gollnick (Apuleius dreamworld)',
    'Elucidating_and_Enigmatizing': 'Priki 2009 (reception)',
    'Canone_Leen_Spruit_Emblematics': 'Canone & Spruit (emblematics)',
    'Francesco_Colonna_Hypnerotomachia_Poliphili_Da_Capo': 'HP (Da Capo edition)',
    'Francesco_Colonna_Rino_Avesani': 'Avesani et al. (Colonna studies)',
    'Albrecht_Durer': 'Leidinger (Durer and HP)',
    'Hypnerotomachia_by_Francesco_Colonna': 'HP primary text',
    'Mario_Praz': 'Praz 1947 (foreign imitators)',
    'Anthony_Blunt': 'Blunt 1937 (HP in French art)',
    'Edward_Wright_Alberti': 'Wright (Alberti and HP)',
    'Liane_Lefaivre': 'Lefaivre 1997 (Alberti attribution)',
    'Ure_Peter': 'Ure 1952 (vocabulary notes)',
    'Rosemary_Trippe': 'Trippe 2002 (text-image)',
    'Mark_Jarzombek': 'Jarzombek 1990 (structural problematics)',
    'Semler': 'Semler 2006 (Dallington)',
    'Narrative_in_Search_of_an_Author': "O'Neill (authorship)",
}
50
51
52def _identify_doc(source_doc_str):
53    """Map a source_doc path to a readable title."""
54    for key, title in DOC_TITLES.items():
55        if key in source_doc_str:
56            return title
57    return source_doc_str.split('/')[-1].replace('.md', '')
58
59
60def _extract_source_documents(passages):
61    """Extract unique source document titles from passages."""
62    docs = set()
63    for p in passages:
64        doc = _identify_doc(p.get('source_doc', ''))
65        docs.add(doc)
66    return sorted(docs)
67
68
69def _extract_page_refs(passages, max_refs=15):
70    """Extract and deduplicate page references."""
71    all_refs = []
72    for p in passages:
73        for ref in p.get('page_refs', []):
74            if ref not in all_refs:
75                all_refs.append(ref)
76    return all_refs[:max_refs]
77
78
79def _extract_short_quotes(passages, max_quotes=3, max_len=200):
80    """Extract short representative quotes from top-scoring passages."""
81    quotes = []
82    for p in sorted(passages, key=lambda x: x.get('relevance_score', 0), reverse=True):
83        text = p.get('text', '').strip()
84        if len(text) < 50:
85            continue
86        # Truncate to max_len
87        if len(text) > max_len:
88            text = text[:max_len].rsplit(' ', 1)[0] + '...'
89        doc = _identify_doc(p.get('source_doc', ''))
90        quotes.append(f"{text} [{doc}]")
91        if len(quotes) >= max_quotes:
92            break
93    return quotes
94
95
def enrich_from_packets():
    """Read all packets and update dictionary_terms in DB.

    For every ``*.json`` file in ``PACKETS_DIR`` (processed in sorted order)
    the packet's passages are condensed into source documents, page
    references and short quotes, and written to the ``dictionary_terms`` row
    whose ``slug`` matches the packet. The row's ``source_method``,
    ``confidence``, ``notes`` and ``updated_at`` columns are overwritten as
    part of the same UPDATE.

    A packet is skipped, with a message on stdout, when:
    - it cannot be read or parsed, or has no 'slug' entry;
    - its slug is not present in ``dictionary_terms``;
    - the term's review_status is 'VERIFIED';
    - it contains no passages.

    All updates are committed together at the end. If an unexpected error
    interrupts the run, nothing is committed and the connection is still
    closed.

    Returns:
        None. Progress and a final summary are printed to stdout.
    """
    if not PACKETS_DIR.exists():
        print("No packets directory found. Run build_reading_packets.py first.")
        return

    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()

        # Snapshot of review status per term; VERIFIED terms are never touched.
        cur.execute("SELECT slug, review_status FROM dictionary_terms")
        term_statuses = dict(cur.fetchall())

        enriched = 0
        skipped_verified = 0

        for packet_file in sorted(PACKETS_DIR.glob('*.json')):
            # One bad packet must not abort the run and discard the updates
            # already prepared for the others.
            try:
                with open(packet_file, 'r', encoding='utf-8') as f:
                    packet = json.load(f)
                slug = packet['slug']
            except (OSError, ValueError, KeyError, TypeError) as exc:
                print(f"  SKIP (unreadable packet): {packet_file.name} ({exc})")
                continue

            # An UPDATE for an unknown slug matches no rows, so it must not
            # be reported or counted as an enrichment.
            if slug not in term_statuses:
                print(f"  SKIP (not in dictionary): {slug}")
                continue

            if term_statuses[slug] == 'VERIFIED':
                print(f"  SKIP (VERIFIED): {slug}")
                skipped_verified += 1
                continue

            # ``or []`` also covers an explicit null in the packet JSON.
            passages = packet.get('passages') or []
            if not passages:
                print(f"  SKIP (no passages): {slug}")
                continue

            # Extract structured data from passages
            source_docs = _extract_source_documents(passages)
            page_refs = _extract_page_refs(passages)
            short_quotes = _extract_short_quotes(passages)

            # Single timestamp so the notes date and updated_at always agree.
            now = datetime.now()
            updates = {
                'source_documents': '; '.join(source_docs) if source_docs else None,
                'source_page_refs': ', '.join(f'p. {r}' for r in page_refs) if page_refs else None,
                'source_quotes_short': ' | '.join(short_quotes) if short_quotes else None,
                'source_method': 'CORPUS_EXTRACTION',
                'confidence': 'MEDIUM',
                'notes': f"Enriched from corpus reading packets on {now.strftime('%Y-%m-%d')}. "
                         f"{len(passages)} passages retrieved from {len(source_docs)} documents.",
                'updated_at': now.isoformat(),
            }

            # Columns with no extracted value are left out of the UPDATE so
            # existing content is not replaced by NULL. The constant columns
            # above guarantee at least one assignment remains.
            assignments = {col: val for col, val in updates.items() if val is not None}
            # Column names come from the fixed dict above, never from packet
            # data; all values are bound as parameters.
            set_sql = ', '.join(f"{col} = ?" for col in assignments)
            cur.execute(
                f"UPDATE dictionary_terms SET {set_sql} WHERE slug = ?",
                [*assignments.values(), slug],
            )
            enriched += 1
            print(f"  ENRICHED: {slug} ({len(source_docs)} docs, {len(page_refs)} refs)")

        conn.commit()
    finally:
        # Closing without a commit discards any uncommitted updates.
        conn.close()

    print(f"\nEnriched {enriched} terms, skipped {skipped_verified} verified terms.")
166
167
# Script entry point: print a banner, then run one enrichment pass over all
# packets. Nothing runs when the module is merely imported.
if __name__ == "__main__":
    print("=== Enriching Dictionary from Reading Packets ===\n")
    enrich_from_packets()