Link Scholars

link_scholars.py — 205 lines
Links scholars to bibliography, tags historical figures, matches summaries.json to bibliography entries.
"""Link scholars to bibliography entries and summaries.json.

Step 1 of the Scholar Pipeline (docs/SCHOLAR_PIPELINE.md).

This script:
1. Adds missing columns (scholar_overview, is_historical_subject, has_summary, summary_source)
2. Tags historical figures
3. Matches summaries.json -> bibliography by author+title
4. Sets has_summary flags on scholar_works
5. Logs unmatched entries

Idempotent and non-destructive.
"""
14
15import sqlite3
16import json
17import re
18import unicodedata
19from pathlib import Path
20
# Project root: the parent of the directory that contains this script.
BASE_DIR = Path(__file__).resolve().parent.parent
# Locations of the SQLite database, the summaries file, and the staging
# directory that receives the unmatched-entries log.
DB_PATH = BASE_DIR.joinpath("db", "hp.db")
SUMMARIES_PATH = BASE_DIR.joinpath("scholars", "summaries.json")
STAGING_DIR = BASE_DIR.joinpath("staging", "scholar")

# Historical figures: HP subjects, not modern scholars.
# Names are compared after normalization, as substrings of scholar names.
HISTORICAL_FIGURES = [
    "Francesco Colonna",
    "Aldus Manutius",
    "Ben Jonson",
    "Fabio Chigi (Pope Alexander VII)",
    "Benedetto Giovio",
    "Paolo Giovio",
    "Jean Martin",
    "Beroalde de Verville",
    "Charles Nodier",
    "Pope Alexander VII",
]
39
40
def normalize_name(name):
    """Normalize a personal name for comparison.

    Lowercases the name, removes accents, strips periods, and collapses
    runs of whitespace into single spaces.

    Args:
        name: Raw name string; may be None or empty.

    Returns:
        The normalized name, or '' when ``name`` is falsy.
    """
    if not name:
        return ''
    # Decompose accented characters (NFKD) and drop the combining marks so
    # that accented and unaccented spellings compare equal.
    decomposed = unicodedata.normalize('NFKD', name.lower())
    s = ''.join(c for c in decomposed if not unicodedata.combining(c))
    s = s.replace('.', '')
    # Strip last: removing periods can expose leading/trailing whitespace
    # (e.g. "Smith ." -> "smith "), which would defeat equality matching.
    return re.sub(r'\s+', ' ', s).strip()
54
55
def normalize_title(title):
    """Normalize a title for comparison.

    Lowercases the title, drops a leading English article ("the", "a",
    "an"), removes punctuation, collapses whitespace, and keeps at most the
    first 80 characters.

    Args:
        title: Raw title string; may be None or empty.

    Returns:
        The normalized title (at most 80 characters), or '' when ``title``
        is falsy.
    """
    if not title:
        return ''
    s = title.lower().strip()
    # Drop leading punctuation (opening quotes, brackets, ...) first so that
    # a quoted title such as '"The Dream"' still has its article removed and
    # normalizes the same way as the unquoted form.
    s = re.sub(r'^[^\w]+', '', s)
    # Remove common prefixes
    s = re.sub(r'^(the|a|an)\s+', '', s)
    # Remove punctuation
    s = re.sub(r'[^\w\s]', '', s)
    # Removing punctuation can leave stray edge whitespace ("title !").
    s = re.sub(r'\s+', ' ', s).strip()
    # First 80 chars for matching; rstrip in case the cut lands on a space.
    return s[:80].rstrip()
67
68
def _ensure_columns(cur):
    """Add the columns this pipeline writes to, skipping ones that exist."""
    wanted = (
        ("scholars", "scholar_overview", "TEXT"),
        ("scholars", "is_historical_subject", "BOOLEAN DEFAULT 0"),
        ("scholar_works", "has_summary", "BOOLEAN DEFAULT 0"),
        ("scholar_works", "summary_source", "TEXT"),
    )
    # PRAGMA table_info rows carry the column name at index 1.
    existing = {
        table: {row[1] for row in cur.execute(f"PRAGMA table_info({table})")}
        for table in ("scholars", "scholar_works")
    }
    for table, column, decl in wanted:
        if column not in existing[table]:
            # Identifiers come from the hard-coded tuple above, never from
            # external input, so interpolating them into the SQL is safe.
            cur.execute(f"ALTER TABLE {table} ADD COLUMN {column} {decl}")
            print(f"  Added {table}.{column}")


def _tag_historical_figures(cur):
    """Flag scholars whose name contains a HISTORICAL_FIGURES entry; return the count."""
    targets = [normalize_name(hist_name) for hist_name in HISTORICAL_FIGURES]
    # Fetch and normalize the scholars once instead of once per figure.
    cur.execute("SELECT id, name FROM scholars")
    scholars = [(sid, sname, normalize_name(sname)) for sid, sname in cur.fetchall()]

    tagged = 0
    for sid, sname, norm_sname in scholars:
        # Substring containment also covers exact equality. Each scholar is
        # visited once, so one matching several figure names (e.g. both
        # "Fabio Chigi (Pope Alexander VII)" and "Pope Alexander VII") is
        # tagged and counted a single time.
        if any(target in norm_sname for target in targets):
            cur.execute("UPDATE scholars SET is_historical_subject = 1 WHERE id = ?", (sid,))
            if cur.rowcount:
                tagged += 1
                print(f"  Tagged: {sname}")
    return tagged


def _load_summaries():
    """Return the parsed contents of SUMMARIES_PATH, or [] when the file is absent."""
    if not SUMMARIES_PATH.exists():
        return []
    with open(SUMMARIES_PATH, encoding='utf-8') as f:
        return json.load(f)


def _find_bib_id(bib_lookup, norm_author, norm_title):
    """Return the bibliography id matching the normalized author/title, or None."""
    # Try exact match first
    bib_id = bib_lookup.get((norm_author, norm_title))
    if bib_id is not None:
        return bib_id

    # Partial title match: same author and one title's first 30 characters
    # contained in the other. An empty title is a substring of everything,
    # so it must not take part, or it would match an arbitrary entry by the
    # same author.
    if not norm_title:
        return None
    for (ba, bt), bid in bib_lookup.items():
        if ba != norm_author or not bt:
            continue
        if norm_title[:30] in bt or bt[:30] in norm_title:
            return bid
    return None


def _mark_summary(cur, author, bib_id):
    """Set has_summary on the scholar_works link for this author and bibliography entry."""
    cur.execute("SELECT id FROM scholars WHERE LOWER(name) = LOWER(?)", (author,))
    scholar = cur.fetchone()
    if not scholar:
        return
    cur.execute("""
        UPDATE scholar_works SET has_summary = 1, summary_source = 'summaries.json'
        WHERE scholar_id = ? AND bib_id = ?
    """, (scholar[0], bib_id))
    if cur.rowcount == 0:
        # Link might not exist, create it
        cur.execute("""
            INSERT OR IGNORE INTO scholar_works (scholar_id, bib_id, has_summary, summary_source)
            VALUES (?, ?, 1, 'summaries.json')
        """, (scholar[0], bib_id))


def main():
    """Run the scholar linking pipeline against the database at DB_PATH.

    Steps:
      1. Add missing columns to ``scholars`` and ``scholar_works``.
      2. Tag scholars that are historical figures.
      3. Match entries of summaries.json to bibliography rows by normalized
         author and title, and set ``has_summary`` on the matching
         scholar_works links.
      4. Write the unmatched summaries to ``STAGING_DIR/unmatched.json``.
      5. Print a report of the resulting counts.

    Progress is printed to stdout. The database connection is always closed,
    even when a step raises.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()

        print("=== Scholar Linking Pipeline ===\n")

        # Step 1: Add missing columns
        print("Step 1: Adding missing columns...")
        _ensure_columns(cur)
        conn.commit()

        # Step 2: Tag historical figures
        print("\nStep 2: Tagging historical figures...")
        tagged = _tag_historical_figures(cur)
        conn.commit()
        print(f"  {tagged} historical figures tagged")

        # Step 3: Load summaries and match to bibliography
        print("\nStep 3: Matching summaries.json to bibliography...")
        summaries = _load_summaries()

        # Normalized (author, title) -> bibliography id; on duplicate keys
        # the last row wins.
        cur.execute("SELECT id, author, title, year FROM bibliography")
        bib_lookup = {
            (normalize_name(bauthor), normalize_title(btitle)): bid
            for bid, bauthor, btitle, _byear in cur.fetchall()
        }

        matched_summaries = 0
        unmatched_summaries = []

        for s in summaries:
            s_author = s.get('author', '')
            s_title = s.get('title', '')
            bib_id = _find_bib_id(
                bib_lookup, normalize_name(s_author), normalize_title(s_title)
            )

            if bib_id is None:
                unmatched_summaries.append({
                    'author': s_author,
                    'title': s_title,
                    'reason': 'no bibliography match found',
                })
                continue

            # Counted as matched once a bibliography entry is found, whether
            # or not a scholar row exists for the author.
            matched_summaries += 1
            _mark_summary(cur, s_author, bib_id)

        conn.commit()
        print(f"  Matched: {matched_summaries}/{len(summaries)} summaries to bibliography")
        print(f"  Unmatched: {len(unmatched_summaries)}")

        # Step 4: Log unmatched
        STAGING_DIR.mkdir(parents=True, exist_ok=True)
        unmatched_path = STAGING_DIR / "unmatched.json"
        with open(unmatched_path, 'w', encoding='utf-8') as f:
            json.dump({
                'unmatched_summaries': unmatched_summaries,
                'summary_match_rate': f"{matched_summaries}/{len(summaries)}",
            }, f, indent=2, ensure_ascii=False)
        print(f"\n  Unmatched log: {unmatched_path}")

        # Step 5: Report
        cur.execute("SELECT COUNT(*) FROM scholars WHERE is_historical_subject = 1")
        hist_count = cur.fetchone()[0]
        cur.execute("SELECT COUNT(*) FROM scholars WHERE is_historical_subject = 0 OR is_historical_subject IS NULL")
        modern_count = cur.fetchone()[0]
        cur.execute("SELECT COUNT(*) FROM scholar_works WHERE has_summary = 1")
        with_summary = cur.fetchone()[0]
        cur.execute("SELECT COUNT(*) FROM scholar_works")
        total_links = cur.fetchone()[0]

        print("\n=== Summary ===")
        print(f"  Historical figures: {hist_count}")
        print(f"  Modern scholars: {modern_count}")
        print(f"  Scholar-work links: {total_links}")
        print(f"  Links with summaries: {with_summary}")
        print(f"  Summary match rate: {matched_summaries}/{len(summaries)}")
    finally:
        # Release the database handle even if a step above raised.
        conn.close()
202
203
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()