Corpus Search

corpus_search.py — 220 lines
Keyword-based search across markdown chunks and documents with provenance tracking.
"""Corpus search utilities: search across markdown chunks and full documents.

Provides keyword-based search across the /chunks/ and /md/ directories
with provenance tracking. No embeddings required.

Functions:
    search_chunks(query, top_n=20) -> list of match dicts
    search_by_term(term_label, synonyms=None) -> list of match dicts
    search_documents(query, doc_filter=None) -> list of match dicts
"""
11
12import re
13import os
14from pathlib import Path
15
# Project root: the parent of the directory that contains this file.
BASE_DIR = Path(__file__).resolve().parent.parent
# Per-document sub-directories of chunk_*.md files (read by search_chunks).
CHUNKS_DIR = BASE_DIR.joinpath("chunks")
# Full markdown documents, one *.md file each (read by search_documents).
MD_DIR = BASE_DIR.joinpath("md")
19
20
21def _parse_chunk_frontmatter(text):
22    """Extract YAML frontmatter from a chunk file."""
23    meta = {}
24    if text.startswith('---'):
25        end = text.find('---', 3)
26        if end > 0:
27            for line in text[3:end].strip().split('\n'):
28                if ':' in line:
29                    key, val = line.split(':', 1)
30                    val = val.strip().strip('"').strip("'")
31                    meta[key.strip()] = val
32    return meta
33
34
35def _extract_page_refs(text):
36    """Extract page markers from chunk text."""
37    return re.findall(r'<!-- Page (\d+) -->', text)
38
39
40def _score_match(text, query_terms):
41    """Score a text block by frequency and proximity of query terms."""
42    text_lower = text.lower()
43    score = 0
44    for term in query_terms:
45        count = text_lower.count(term.lower())
46        score += count
47    return score
48
49
50def _context_window(text, query_terms, window=300):
51    """Extract the best context window around the query terms."""
52    text_lower = text.lower()
53    best_pos = -1
54    best_score = 0
55
56    for term in query_terms:
57        pos = text_lower.find(term.lower())
58        if pos >= 0:
59            # Score this position by counting nearby term occurrences
60            start = max(0, pos - window)
61            end = min(len(text), pos + window)
62            snippet = text_lower[start:end]
63            score = sum(snippet.count(t.lower()) for t in query_terms)
64            if score > best_score:
65                best_score = score
66                best_pos = pos
67
68    if best_pos < 0:
69        return text[:window * 2] if len(text) > window * 2 else text
70
71    start = max(0, best_pos - window)
72    end = min(len(text), best_pos + window)
73    snippet = text[start:end].strip()
74
75    # Clean up snippet boundaries
76    if start > 0:
77        first_space = snippet.find(' ')
78        if first_space > 0:
79            snippet = '...' + snippet[first_space:]
80    if end < len(text):
81        last_space = snippet.rfind(' ')
82        if last_space > 0:
83            snippet = snippet[:last_space] + '...'
84
85    return snippet
86
87
def search_chunks(query, top_n=20):
    """Search across all chunk files for passages matching query.

    Every ``chunk_*.md`` file in each sub-directory of ``CHUNKS_DIR`` is
    read, its frontmatter (if any) is removed, and the remaining body is
    scored by counting occurrences of the query terms.

    Args:
        query: Search string (can be multiple words). Words of two
            characters or fewer are ignored.
        top_n: Maximum results to return

    Returns:
        List of dicts: {chunk_path, source_doc, section, page_refs,
                        matched_text, relevance_score, word_count},
        ordered by descending relevance_score. Empty when the chunks
        directory is missing, the query has no usable terms, or nothing
        matches.
    """
    if not CHUNKS_DIR.exists():
        return []

    query_terms = [t for t in query.lower().split() if len(t) > 2]
    if not query_terms:
        return []

    results = []

    for doc_dir in sorted(CHUNKS_DIR.iterdir()):
        if not doc_dir.is_dir():
            continue
        for chunk_file in sorted(doc_dir.glob('chunk_*.md')):
            text = chunk_file.read_text(encoding='utf-8', errors='replace')
            meta = _parse_chunk_frontmatter(text)

            # Strip frontmatter for content search. Only do so when the file
            # really opens with a frontmatter fence; otherwise a '---'
            # horizontal rule inside the body would cut away everything
            # that precedes it.
            body = text
            if text.startswith('---'):
                frontmatter_end = text.find('---', 3)
                if frontmatter_end > 0:
                    body = text[frontmatter_end + 3:].strip()

            score = _score_match(body, query_terms)
            if score == 0:
                continue

            # A blank or non-numeric word_count in one file must not abort
            # the whole search; report 0 for that chunk instead.
            try:
                word_count = int(meta.get('word_count', 0))
            except ValueError:
                word_count = 0

            results.append({
                'chunk_path': str(chunk_file.relative_to(BASE_DIR)),
                'source_doc': meta.get('source', str(doc_dir.name)),
                'section': meta.get('section', ''),
                'page_refs': _extract_page_refs(body),
                'matched_text': _context_window(body, query_terms),
                'relevance_score': score,
                'word_count': word_count,
            })

    # list.sort is stable, so equally scored chunks keep directory/file order.
    results.sort(key=lambda x: x['relevance_score'], reverse=True)
    return results[:top_n]
138
139
def search_by_term(term_label, synonyms=None):
    """Search for a dictionary term across all chunks.

    Runs ``search_chunks`` once for the label and once for each synonym,
    then merges the hits so that every chunk appears at most once.

    Args:
        term_label: The term label (e.g. "Signature")
        synonyms: Optional list of alternative forms to search

    Returns:
        List of at most 20 match dicts (same format as search_chunks),
        ordered by descending relevance_score. A chunk matched by several
        forms is reported once, with the hit that scored highest.
    """
    search_terms = [term_label]
    if synonyms:
        search_terms.extend(synonyms)

    # chunk_path -> best hit seen so far. Keeping the best hit rather than
    # the first one means a chunk that barely mentions the label but is full
    # of a synonym is ranked by its strongest match.
    best_by_path = {}

    for term in search_terms:
        for match in search_chunks(term, top_n=30):
            path = match['chunk_path']
            current = best_by_path.get(path)
            if current is None or match['relevance_score'] > current['relevance_score']:
                best_by_path[path] = match

    ranked = sorted(
        best_by_path.values(),
        key=lambda x: x['relevance_score'],
        reverse=True,
    )
    return ranked[:20]
166
167
def search_documents(query, doc_filter=None):
    """Run a keyword search over the full markdown documents in ``MD_DIR``.

    Unlike ``search_chunks`` this works on whole ``*.md`` files and applies
    no limit to the number of results.

    Args:
        query: Search string; words of two characters or fewer are ignored.
        doc_filter: Optional substring; when given, only files whose name
            contains it (case-insensitively) are searched.

    Returns:
        List of dicts: {doc_path, page_refs, matched_text, relevance_score},
        ordered by descending relevance_score. Empty when the directory is
        missing, the query has no usable terms, or nothing matches.
    """
    if not MD_DIR.exists():
        return []

    terms = [word for word in query.lower().split() if len(word) > 2]
    if not terms:
        return []

    hits = []
    for doc_file in sorted(MD_DIR.glob('*.md')):
        name_ok = not doc_filter or doc_filter.lower() in doc_file.name.lower()
        if not name_ok:
            continue

        contents = doc_file.read_text(encoding='utf-8', errors='replace')
        relevance = _score_match(contents, terms)
        if relevance == 0:
            continue

        pages = _extract_page_refs(contents)
        excerpt = _context_window(contents, terms)
        hits.append({
            'doc_path': str(doc_file.relative_to(BASE_DIR)),
            'page_refs': pages,
            'matched_text': excerpt,
            'relevance_score': relevance,
        })

    return sorted(hits, key=lambda hit: hit['relevance_score'], reverse=True)
208
209
if __name__ == "__main__":
    # Command-line demo: search the chunks for the words given as arguments
    # (or a sample query) and print the ten best matches.
    import sys

    cli_words = sys.argv[1:]
    query = ' '.join(cli_words) if cli_words else "alchemical mercury"
    print(f"Searching chunks for: '{query}'\n")

    results = search_chunks(query, top_n=10)
    for rank, hit in enumerate(results, 1):
        pages = ', '.join(hit['page_refs'][:5])
        preview = hit['matched_text'][:200]
        print(f"{rank}. [{hit['relevance_score']}] {hit['source_doc']}")
        print(f"   Section: {hit['section']}")
        print(f"   Pages: {pages}")
        print(f"   Match: {preview}...")
        print()