Corpus Search
Keyword-based search across markdown chunks and documents with provenance tracking.
"""Corpus search utilities: search across markdown chunks and full documents.

Provides keyword-based search across the /chunks/ and /md/ directories
with provenance tracking. No embeddings required.

Functions:
    search_chunks(query, top_n=20) -> list of match dicts
    search_by_term(term_label, synonyms=None) -> list of match dicts
    search_documents(query, doc_filter=None) -> list of match dicts
"""

import re
import os
from pathlib import Path

BASE_DIR = Path(__file__).resolve().parent.parent
CHUNKS_DIR = BASE_DIR / "chunks"
MD_DIR = BASE_DIR / "md"

# Delimiter that opens and closes the frontmatter header of a chunk file.
_FRONTMATTER_DELIM = '---'

# Page markers embedded in the markdown, e.g. "<!-- Page 12 -->".
_PAGE_MARKER_RE = re.compile(r'<!-- Page (\d+) -->')

# Query words of this length or shorter are dropped before searching.
_MIN_TERM_LENGTH = 2


def _split_frontmatter(text):
    """Split a chunk file into (frontmatter_header, body).

    The header is only recognised when the text *starts* with the
    delimiter and a closing delimiter follows. Otherwise the header is
    empty and the body is the text unchanged, so a '---' horizontal rule
    in a file without frontmatter never truncates the searchable body.
    """
    if not text.startswith(_FRONTMATTER_DELIM):
        return '', text
    delim_len = len(_FRONTMATTER_DELIM)
    end = text.find(_FRONTMATTER_DELIM, delim_len)
    if end < 0:
        return '', text
    return text[delim_len:end], text[end + delim_len:].strip()


def _parse_chunk_frontmatter(text):
    """Extract simple 'key: value' frontmatter from a chunk file.

    Returns a dict of stripped keys to stripped values with surrounding
    quotes removed; empty when the text has no frontmatter header.
    """
    header, _body = _split_frontmatter(text)
    meta = {}
    for line in header.strip().split('\n'):
        if ':' in line:
            key, val = line.split(':', 1)
            meta[key.strip()] = val.strip().strip('"').strip("'")
    return meta


def _extract_page_refs(text):
    """Return the page numbers (as strings) of all page markers in text."""
    return _PAGE_MARKER_RE.findall(text)


def _query_terms(query):
    """Lower-case and split a query, dropping words of two characters or fewer."""
    return [t for t in query.lower().split() if len(t) > _MIN_TERM_LENGTH]


def _safe_int(value, default=0):
    """Convert value to int, returning default when it is not numeric."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def _score_match(text, query_terms):
    """Score a text block by the total number of query-term occurrences.

    Matching is case-insensitive substring counting. Empty terms are
    skipped because str.count('') would count every character position.
    """
    text_lower = text.lower()
    return sum(text_lower.count(term.lower()) for term in query_terms if term)


def _context_window(text, query_terms, window=300):
    """Extract the best context window around the query terms.

    For the first occurrence of each term, the surrounding span of
    ``window`` characters on either side is scored by how many term
    occurrences it contains; the highest-scoring span is returned with
    '...' markers where it was cut at a word boundary. If no term scores,
    the first ``2 * window`` characters of the text are returned.
    """
    text_lower = text.lower()
    terms = [t.lower() for t in query_terms]
    best_pos = -1
    best_score = 0

    for term in terms:
        pos = text_lower.find(term)
        if pos < 0:
            continue
        # Score this position by counting nearby term occurrences.
        start = max(0, pos - window)
        end = min(len(text), pos + window)
        nearby = text_lower[start:end]
        score = sum(nearby.count(t) for t in terms)
        if score > best_score:
            best_score = score
            best_pos = pos

    if best_pos < 0:
        return text[:window * 2] if len(text) > window * 2 else text

    start = max(0, best_pos - window)
    end = min(len(text), best_pos + window)
    snippet = text[start:end].strip()

    # Trim partial words at the cut edges and mark the truncation.
    if start > 0:
        first_space = snippet.find(' ')
        if first_space > 0:
            snippet = '...' + snippet[first_space:]
    if end < len(text):
        last_space = snippet.rfind(' ')
        if last_space > 0:
            snippet = snippet[:last_space] + '...'

    return snippet


def search_chunks(query, top_n=20):
    """Search across all chunk files for passages matching query.

    Args:
        query: Search string (can be multiple words). Words of two
            characters or fewer are ignored.
        top_n: Maximum results to return

    Returns:
        List of dicts: {chunk_path, source_doc, section, page_refs,
                        matched_text, relevance_score, word_count},
        sorted by relevance_score, highest first. Empty when the chunks
        directory is missing or the query has no usable words.
    """
    if not CHUNKS_DIR.exists():
        return []

    query_terms = _query_terms(query)
    if not query_terms:
        return []

    results = []

    for doc_dir in sorted(CHUNKS_DIR.iterdir()):
        if not doc_dir.is_dir():
            continue
        for chunk_file in sorted(doc_dir.glob('chunk_*.md')):
            text = chunk_file.read_text(encoding='utf-8', errors='replace')
            meta = _parse_chunk_frontmatter(text)

            # Search the content only, not the frontmatter header.
            _header, body = _split_frontmatter(text)

            score = _score_match(body, query_terms)
            if score == 0:
                continue

            results.append({
                'chunk_path': str(chunk_file.relative_to(BASE_DIR)),
                'source_doc': meta.get('source', str(doc_dir.name)),
                'section': meta.get('section', ''),
                'page_refs': _extract_page_refs(body),
                'matched_text': _context_window(body, query_terms),
                'relevance_score': score,
                # A malformed word_count must not abort the whole search.
                'word_count': _safe_int(meta.get('word_count', 0)),
            })

    results.sort(key=lambda x: x['relevance_score'], reverse=True)
    return results[:top_n]


def search_by_term(term_label, synonyms=None):
    """Search for a dictionary term across all chunks.

    Each of the label and its synonyms is run through search_chunks and
    the results are merged by chunk path.

    Args:
        term_label: The term label (e.g. "Signature")
        synonyms: Optional list of alternative forms to search

    Returns:
        List of at most 20 match dicts (same format as search_chunks),
        sorted by relevance_score, highest first.
    """
    search_terms = [term_label]
    if synonyms:
        search_terms.extend(synonyms)

    # Keep one entry per chunk. When several forms hit the same chunk,
    # keep the strongest match so the ranking does not depend on the
    # order in which the forms were searched.
    best_by_path = {}
    for term in search_terms:
        for match in search_chunks(term, top_n=30):
            path = match['chunk_path']
            current = best_by_path.get(path)
            if current is None or match['relevance_score'] > current['relevance_score']:
                best_by_path[path] = match

    all_results = list(best_by_path.values())
    all_results.sort(key=lambda x: x['relevance_score'], reverse=True)
    return all_results[:20]


def search_documents(query, doc_filter=None):
    """Search across full markdown documents (not chunks).

    Args:
        query: Search string. Words of two characters or fewer are ignored.
        doc_filter: Optional substring to filter document filenames
            (case-insensitive)

    Returns:
        List of dicts: {doc_path, page_refs, matched_text, relevance_score},
        sorted by relevance_score, highest first.
    """
    if not MD_DIR.exists():
        return []

    query_terms = _query_terms(query)
    if not query_terms:
        return []

    results = []

    for md_file in sorted(MD_DIR.glob('*.md')):
        if doc_filter and doc_filter.lower() not in md_file.name.lower():
            continue

        text = md_file.read_text(encoding='utf-8', errors='replace')
        score = _score_match(text, query_terms)
        if score == 0:
            continue

        results.append({
            'doc_path': str(md_file.relative_to(BASE_DIR)),
            'page_refs': _extract_page_refs(text),
            'matched_text': _context_window(text, query_terms),
            'relevance_score': score,
        })

    results.sort(key=lambda x: x['relevance_score'], reverse=True)
    return results


def main(argv=None):
    """Run a chunk search from the command line and print the top 10 hits.

    Args:
        argv: Query words; defaults to sys.argv[1:]. With no words the
            demo query "alchemical mercury" is used.
    """
    import sys

    args = sys.argv[1:] if argv is None else list(argv)
    query = ' '.join(args) if args else "alchemical mercury"
    print(f"Searching chunks for: '{query}'\n")
    results = search_chunks(query, top_n=10)
    for i, r in enumerate(results, 1):
        print(f"{i}. [{r['relevance_score']}] {r['source_doc']}")
        print(f" Section: {r['section']}")
        print(f" Pages: {', '.join(r['page_refs'][:5])}")
        print(f" Match: {r['matched_text'][:200]}...")
        print()


if __name__ == "__main__":
    main()