Link Scholars
Links scholars to bibliography, tags historical figures, matches summaries.json to bibliography entries.
"""Link scholars to bibliography entries and summaries.json.

Step 1 of the Scholar Pipeline (docs/SCHOLAR_PIPELINE.md).

This script:
1. Adds missing columns (scholar_overview, is_historical_subject, has_summary, summary_source)
2. Tags historical figures
3. Matches summaries.json -> bibliography by author+title
4. Sets has_summary flags on scholar_works
5. Logs unmatched entries

Idempotent and non-destructive.
"""

import sqlite3
import json
import re
import unicodedata
from contextlib import closing
from pathlib import Path

BASE_DIR = Path(__file__).resolve().parent.parent
DB_PATH = BASE_DIR / "db" / "hp.db"
SUMMARIES_PATH = BASE_DIR / "scholars" / "summaries.json"
STAGING_DIR = BASE_DIR / "staging" / "scholar"

# Historical figures: HP subjects, not modern scholars
HISTORICAL_FIGURES = [
    "Francesco Colonna",
    "Aldus Manutius",
    "Ben Jonson",
    "Fabio Chigi (Pope Alexander VII)",
    "Benedetto Giovio",
    "Paolo Giovio",
    "Jean Martin",
    "Beroalde de Verville",
    "Charles Nodier",
    "Pope Alexander VII",
]

# Number of leading characters of a normalized title used for partial matching.
_TITLE_PREFIX_LEN = 30

# (table, column, statement) for every column this step may need to add.
_COLUMN_MIGRATIONS = (
    ("scholars", "scholar_overview",
     "ALTER TABLE scholars ADD COLUMN scholar_overview TEXT"),
    ("scholars", "is_historical_subject",
     "ALTER TABLE scholars ADD COLUMN is_historical_subject BOOLEAN DEFAULT 0"),
    ("scholar_works", "has_summary",
     "ALTER TABLE scholar_works ADD COLUMN has_summary BOOLEAN DEFAULT 0"),
    ("scholar_works", "summary_source",
     "ALTER TABLE scholar_works ADD COLUMN summary_source TEXT"),
)


def _strip_accents(text):
    """Return *text* decomposed (NFKD) with all combining marks removed."""
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(c for c in decomposed if not unicodedata.combining(c))


def normalize_name(name):
    """Normalize a name for comparison.

    Lowercases, removes accents and periods, and collapses runs of
    whitespace. Returns '' for a falsy input (None or empty string).
    """
    if not name:
        return ''
    s = _strip_accents(name.lower().strip())
    # Strip periods so "J. Smith" and "J Smith" compare equal
    s = s.replace('.', '')
    # Collapse whitespace; strip again because removing a period can
    # leave a dangling space at either end
    return re.sub(r'\s+', ' ', s).strip()


def normalize_title(title):
    """Normalize a title for comparison.

    Lowercases, removes accents, drops a leading English article, removes
    punctuation, collapses whitespace and keeps the first 80 characters.
    Returns '' for a falsy input (None or empty string).
    """
    if not title:
        return ''
    # Fold accents first: the punctuation regex below would otherwise drop
    # combining marks but keep precomposed letters, so the same title in
    # NFC and NFD form would normalize to different strings.
    s = _strip_accents(title.lower().strip())
    # Remove common prefixes
    s = re.sub(r'^(the|a|an)\s+', '', s)
    # Remove punctuation
    s = re.sub(r'[^\w\s]', '', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s[:80]  # First 80 chars for matching


def _ensure_columns(cur):
    """Add any pipeline columns that the scholars/scholar_works tables lack."""
    existing = {}
    for table, column, statement in _COLUMN_MIGRATIONS:
        if table not in existing:
            # Row index 1 of PRAGMA table_info is the column name
            existing[table] = {
                row[1] for row in cur.execute(f"PRAGMA table_info({table})")
            }
        if column not in existing[table]:
            cur.execute(statement)
            print(f" Added {table}.{column}")


def _tag_historical_figures(cur):
    """Flag scholars whose name contains a HISTORICAL_FIGURES entry.

    Returns the number of distinct scholars flagged. Matching is a
    substring test on normalized names, so a scholar is counted once even
    when several list entries match it.
    """
    targets = [normalize_name(figure) for figure in HISTORICAL_FIGURES]
    cur.execute("SELECT id, name FROM scholars")
    tagged = 0
    for sid, sname in cur.fetchall():
        norm = normalize_name(sname)
        if any(target in norm for target in targets):
            cur.execute(
                "UPDATE scholars SET is_historical_subject = 1 WHERE id = ?",
                (sid,),
            )
            tagged += 1
            print(f" Tagged: {sname}")
    return tagged


def _load_summaries():
    """Return the parsed contents of SUMMARIES_PATH, or [] if it is absent."""
    if not SUMMARIES_PATH.exists():
        return []
    with open(SUMMARIES_PATH, encoding='utf-8') as f:
        return json.load(f)


def _find_bib_id(bib_lookup, author, title):
    """Return the bibliography id matching author+title, or None.

    Tries an exact match on the normalized (author, title) key, then a
    partial match: same normalized author and one title's leading
    characters contained in the other.
    """
    norm_author = normalize_name(author)
    norm_title = normalize_title(title)

    bib_id = bib_lookup.get((norm_author, norm_title))
    if bib_id is not None:
        return bib_id

    # An empty string is a substring of every string, so a blank title
    # must never take part in the containment test.
    if not norm_title:
        return None
    prefix = norm_title[:_TITLE_PREFIX_LEN]
    for (bib_author, bib_title), candidate in bib_lookup.items():
        if bib_author != norm_author or not bib_title:
            continue
        if prefix in bib_title or bib_title[:_TITLE_PREFIX_LEN] in norm_title:
            return candidate
    return None


def _match_summaries(cur, summaries):
    """Match summaries to bibliography rows and flag scholar_works.

    Returns (matched_count, unmatched) where unmatched is a list of dicts
    with 'author', 'title' and 'reason' keys for summaries that had no
    bibliography match.
    """
    cur.execute("SELECT id, author, title, year FROM bibliography")
    # Build normalized lookup; a later row replaces an earlier one that
    # normalizes to the same (author, title) key
    bib_lookup = {
        (normalize_name(bauthor), normalize_title(btitle)): bid
        for bid, bauthor, btitle, _byear in cur.fetchall()
    }

    matched = 0
    unmatched = []
    for summary in summaries:
        s_author = summary.get('author', '')
        s_title = summary.get('title', '')
        bib_id = _find_bib_id(bib_lookup, s_author, s_title)

        if bib_id is None:
            unmatched.append({
                'author': s_author,
                'title': s_title,
                'reason': 'no bibliography match found',
            })
            continue

        matched += 1
        # Find the scholar for this author
        cur.execute(
            "SELECT id FROM scholars WHERE LOWER(name) = LOWER(?)",
            (s_author,),
        )
        scholar = cur.fetchone()
        if scholar is None:
            continue
        # Update scholar_works with has_summary
        cur.execute("""
            UPDATE scholar_works SET has_summary = 1, summary_source = 'summaries.json'
            WHERE scholar_id = ? AND bib_id = ?
        """, (scholar[0], bib_id))
        if cur.rowcount == 0:
            # Link might not exist, create it
            cur.execute("""
                INSERT OR IGNORE INTO scholar_works (scholar_id, bib_id, has_summary, summary_source)
                VALUES (?, ?, 1, 'summaries.json')
            """, (scholar[0], bib_id))
    return matched, unmatched


def _write_unmatched_log(unmatched, matched, total):
    """Write the unmatched-summaries report under STAGING_DIR; return its path."""
    STAGING_DIR.mkdir(parents=True, exist_ok=True)
    unmatched_path = STAGING_DIR / "unmatched.json"
    with open(unmatched_path, 'w', encoding='utf-8') as f:
        json.dump({
            'unmatched_summaries': unmatched,
            'summary_match_rate': f"{matched}/{total}",
        }, f, indent=2, ensure_ascii=False)
    return unmatched_path


def _count(cur, query):
    """Run a single-value COUNT query and return its integer result."""
    cur.execute(query)
    return cur.fetchone()[0]


def main():
    """Run the scholar linking step against DB_PATH and print a report.

    Commits after each step. The connection is closed even if a step
    raises, which discards that step's uncommitted changes.
    """
    # sqlite3's own connection context manager only manages transactions;
    # closing() is what guarantees the handle is released.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        cur = conn.cursor()

        print("=== Scholar Linking Pipeline ===\n")

        # Step 1: Add missing columns
        print("Step 1: Adding missing columns...")
        _ensure_columns(cur)
        conn.commit()

        # Step 2: Tag historical figures
        print("\nStep 2: Tagging historical figures...")
        tagged = _tag_historical_figures(cur)
        conn.commit()
        print(f" {tagged} historical figures tagged")

        # Step 3: Load summaries and match to bibliography
        print("\nStep 3: Matching summaries.json to bibliography...")
        summaries = _load_summaries()
        matched_summaries, unmatched_summaries = _match_summaries(cur, summaries)
        conn.commit()
        print(f" Matched: {matched_summaries}/{len(summaries)} summaries to bibliography")
        print(f" Unmatched: {len(unmatched_summaries)}")

        # Step 4: Log unmatched
        unmatched_path = _write_unmatched_log(
            unmatched_summaries, matched_summaries, len(summaries)
        )
        print(f"\n Unmatched log: {unmatched_path}")

        # Step 5: Report
        hist_count = _count(
            cur, "SELECT COUNT(*) FROM scholars WHERE is_historical_subject = 1"
        )
        modern_count = _count(
            cur,
            "SELECT COUNT(*) FROM scholars WHERE is_historical_subject = 0 OR is_historical_subject IS NULL",
        )
        with_summary = _count(
            cur, "SELECT COUNT(*) FROM scholar_works WHERE has_summary = 1"
        )
        total_links = _count(cur, "SELECT COUNT(*) FROM scholar_works")

        print("\n=== Summary ===")
        print(f" Historical figures: {hist_count}")
        print(f" Modern scholars: {modern_count}")
        print(f" Scholar-work links: {total_links}")
        print(f" Links with summaries: {with_summary}")
        print(f" Summary match rate: {matched_summaries}/{len(summaries)}")


if __name__ == "__main__":
    main()