Build Reading Packets
Assembles structured research packets from corpus search for dictionary enrichment.
"""Build structured reading packets for dictionary term enrichment.

For each dictionary term (or a specified subset), this script:
1. Searches the chunk corpus for relevant passages
2. Assembles a structured packet with full provenance
3. Writes packets to /staging/packets/[slug].json

Packets contain ONLY retrieved evidence. No generated interpretations.
Downstream enrichment scripts use packets as input.
"""

import json
import sqlite3
import sys
from pathlib import Path

BASE_DIR = Path(__file__).resolve().parent.parent
DB_PATH = BASE_DIR / "db" / "hp.db"
STAGING_DIR = BASE_DIR / "staging"
PACKETS_DIR = STAGING_DIR / "packets"

# Import corpus search. The scripts/ directory is pushed onto sys.path first,
# so this import has to stay below the path manipulation.
sys.path.insert(0, str(BASE_DIR / "scripts"))
from corpus_search import search_by_term, search_chunks  # noqa: E402,F401


# Synonyms / alternate forms for better search coverage
TERM_SYNONYMS = {
    'signature': ['signature mark', 'sig.', 'quire mark'],
    'quire': ['gathering', 'quaternion'],
    'folio': ['leaf', 'fol.'],
    'marginalia': ['marginal note', 'annotation', 'margin'],
    'annotator-hand': ['hand A', 'hand B', 'hand C', 'hand D', 'hand E',
                       'annotator', 'handwriting'],
    'alchemical-allegory': ['alchemical reading', 'alchemical interpretation',
                            'alchemist'],
    'master-mercury': ['mercury', 'Mercurii', 'quicksilver', "d'Espagnet"],
    'sol-luna': ['Sol and Luna', 'sun and moon', 'gold and silver'],
    'chemical-wedding': ['chemical marriage', 'chymische Hochzeit',
                         'hermaphrodite'],
    'prisca-sapientia': ['ancient wisdom', 'prisca theologia',
                         'Hermes Trismegistus'],
    'woodcut': ['illustration', 'woodblock', 'woodcuts'],
    'acrostic': ['POLIAM FRATER', 'chapter initials'],
    'hieroglyph': ['hieroglyphic', 'Horapollo', 'pseudo-Egyptian'],
    'emblem': ['emblem book', 'Alciato', 'pictura'],
    'ekphrasis': ['ekphrastic', 'verbal description'],
    'incunabulum': ['incunabula', 'ISTC', 'fifteenth-century printing'],
    'aldus-manutius': ['Aldus', 'Manutius',
                       'Aldine'],
    'authorship-debate': ['Francesco Colonna', 'Alberti', 'authorship'],
    'dream-narrative': ['dream', 'Poliphilo falls asleep', 'dream-within'],
    'elephant-obelisk': ['elephant', 'obelisk', 'Bernini', 'b6v', 'b7r'],
    'ideogram': ['alchemical symbol', 'alchemical sign', 'ideogram'],
    'activity-book': ['activity book', 'humanistic activity'],
    'inventio': ['invention', 'rhetorical invention'],
    'ingegno': ['ingenium', 'wit', 'ingegno'],
    'acutezze': ['acutezza', 'wit', 'Alexander VII', 'Chigi'],
    'cythera': ['Cythera', 'island of Venus', 'circular garden'],
    'reception-history': ['reception', 'readership', 'readers'],
    'antiquarianism': ['antiquarian', 'Cyriacus', 'ancient monuments'],
    'vernacular-poetics': ['Petrarchan', 'vernacular', 'Italian poetry'],
    'collation': ['collation formula', 'a-z8', 'bibliographic structure'],
    'apparatus': ['critical edition', 'textual notes', 'apparatus criticus'],
    'commentary': ['commentator', 'gloss', 'interpretation'],
    'allegory': ['allegorical', 'allegory of love'],
    'architectural-body': ['architectural body', 'Lefaivre', 'embodied'],
    'recto': ['recto'],
    'verso': ['verso'],
    'gathering': ['gathering', 'quaternion'],
}


def build_packet(term_slug, term_label, category, current_status):
    """Build a reading packet for a single dictionary term.

    The corpus is queried through ``search_by_term`` with the term label and
    any alternate forms registered for the slug in ``TERM_SYNONYMS``.

    Args:
        term_slug: Slug of the term; also the lookup key into TERM_SYNONYMS.
        term_label: Label passed to the corpus search as the primary term.
        category: Category value copied into the packet unchanged.
        current_status: Review status copied into the packet unchanged.

    Returns:
        A structured dict with retrieved evidence only: the term metadata,
        the list of passages with their provenance fields, the passage
        count, and the search terms that were used.
    """
    synonyms = TERM_SYNONYMS.get(term_slug, [])

    # Search using term label + synonyms
    hits = search_by_term(term_label, synonyms=synonyms)

    # Keep only the provenance fields the packet format carries.
    passages = [
        {
            'text': hit['matched_text'],
            'source_doc': hit['source_doc'],
            'chunk_path': hit['chunk_path'],
            'section': hit['section'],
            'page_refs': hit['page_refs'],
            'relevance_score': hit['relevance_score'],
        }
        for hit in hits
    ]

    return {
        'term': term_label,
        'slug': term_slug,
        'category': category,
        'current_review_status': current_status,
        'passage_count': len(passages),
        'passages': passages,
        'search_terms_used': [term_label] + synonyms,
        'source_method': 'CORPUS_EXTRACTION',
    }


def build_all_packets(filter_status=None, filter_slugs=None):
    """Build reading packets for all (or filtered) dictionary terms.

    Terms are read from the ``dictionary_terms`` table of the database at
    DB_PATH, and one ``<slug>.json`` file per selected term is written into
    PACKETS_DIR (created if missing).

    Args:
        filter_status: Only build for terms with this review_status
            (e.g. 'DRAFT'). A falsy value disables the filter.
        filter_slugs: Only build for these specific slugs. A single string
            is treated as one slug. A falsy value disables the filter.

    Returns:
        The number of packets written.
    """
    query = "SELECT slug, label, category, review_status FROM dictionary_terms"
    params = []
    if filter_status:
        query += " WHERE review_status = ?"
        params.append(filter_status)
    query += " ORDER BY slug"

    # Close the connection even when the query fails.
    conn = sqlite3.connect(DB_PATH)
    try:
        terms = conn.execute(query, params).fetchall()
    finally:
        conn.close()

    # Normalise the slug filter once: a lone string would otherwise be
    # matched by substring, and a list costs a linear scan per row.
    if isinstance(filter_slugs, str):
        filter_slugs = [filter_slugs]
    wanted = set(filter_slugs) if filter_slugs else None

    PACKETS_DIR.mkdir(parents=True, exist_ok=True)

    built = 0
    seen = set()
    for slug, label, category, status in terms:
        if wanted is not None and slug not in wanted:
            continue

        print(f" Building packet: {slug} ({category})")
        packet = build_packet(slug, label, category, status)

        packet_path = PACKETS_DIR / f"{slug}.json"
        with open(packet_path, 'w', encoding='utf-8') as f:
            json.dump(packet, f, indent=2, ensure_ascii=False)

        seen.add(slug)
        built += 1
        print(f" -> {packet['passage_count']} passages found")

    # Report requested slugs that matched no row instead of dropping them
    # silently (typically a typo on the command line).
    if wanted is not None:
        missing = sorted(str(slug) for slug in wanted - seen)
        if missing:
            print(f" Warning: no matching term for slug(s): {', '.join(missing)}")

    print(f"\nBuilt {built} reading packets in {PACKETS_DIR}")
    return built


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Slugs to build packets for; defaults to ``sys.argv[1:]``.
            With no slugs, packets are built for all DRAFT terms.
    """
    slugs = sys.argv[1:] if argv is None else list(argv)

    print("=== Building Reading Packets ===\n")

    if slugs:
        # Build for specific slugs
        build_all_packets(filter_slugs=slugs)
    else:
        # Build for all DRAFT terms
        build_all_packets(filter_status='DRAFT')


if __name__ == "__main__":
    main()