Catalog Images
Parses image filenames from BL and Siena collections into the images table with folio/side metadata.
"""Catalog all manuscript images into the database.

Reads from the HIGH-QUALITY original image directories (referenced in
manuscripts.image_dir), NOT from the compressed site/images/ copies.
Stores both master_path (original) and web_path (compressed) for each image.
"""

import re
import sqlite3
import sys
from contextlib import closing
from pathlib import Path

BASE_DIR = Path(__file__).resolve().parent.parent
DB_PATH = BASE_DIR / "db" / "hp.db"
SITE_DIR = BASE_DIR / "site"

# Web image directory mapping: shelfmark -> site/images/ subfolder
WEB_IMAGE_DIRS = {
    'C.60.o.12': 'images/bl',
    'O.III.38': 'images/siena',
}

# Filename patterns, compiled once. They are applied with .match(), i.e.
# anchored at the start of the filename only.
_BL_PAGE_RE = re.compile(r'C_60_o_12-(\d{3})\.jpg')
_BL_DETAIL_RE = re.compile(r'BL\s*HP\s*(\d+)\.jpg')
_BL_EXTRA_RE = re.compile(r'BL\s*[\d.]+\.jpg')
_SIENA_PAGE_RE = re.compile(r'O\.III\.38_(\d{4})([rv])\.jpg')
_SIENA_DETAIL_RE = re.compile(r'Siena\s*[Hh][Pp]\s*(\d+)\.jpg', re.IGNORECASE)


def _image_record(page_type, sort_order, folio_number=None, side=None):
    """Build the metadata dict shared by both filename parsers."""
    return {
        'folio_number': folio_number,
        'side': side,
        'page_type': page_type,
        'sort_order': sort_order,
    }


def parse_bl_filename(filename: str) -> dict:
    """Parse BL C.60.o.12 image filenames.

    Patterns:
        C_60_o_12-NNN.jpg     -> sequential page scan
        BL HP NN.jpg          -> marginalia detail photo
        BL 2.1.jpg, BL2.jpg   -> supplementary photos

    Returns a dict with keys 'folio_number', 'side', 'page_type' and
    'sort_order'. For page scans 'folio_number' is the scan's sequence
    number as a string (leading zeros dropped); 'side' is always None
    because it is not encoded in BL filenames. Unrecognised names are
    returned as page_type 'OTHER' with sort_order 9999.
    """
    # Sequential page scans
    m = _BL_PAGE_RE.match(filename)
    if m:
        seq = int(m.group(1))
        return _image_record('PAGE', seq, folio_number=str(seq))

    # Marginalia detail photos: sorted after the page scans (1000+)
    m = _BL_DETAIL_RE.match(filename)
    if m:
        return _image_record('MARGINALIA_DETAIL', 1000 + int(m.group(1)))

    # Other BL supplementary photos share one sort slot
    if _BL_EXTRA_RE.match(filename):
        return _image_record('OTHER', 2000)

    return _image_record('OTHER', 9999)


def parse_siena_filename(filename: str) -> dict:
    """Parse Siena O.III.38 image filenames.

    Patterns:
        O.III.38_NNNNr.jpg / O.III.38_NNNNv.jpg  -> folio recto/verso
        O.III.38_000antcop.jpg                   -> front cover
        O.III.38_000fdg1r.jpg                    -> guard leaf
        O.III.38_postcop.jpg                     -> back cover
        Siena HP N.jpg / Siena Hp N.jpg          -> marginalia detail

    Returns a dict with keys 'folio_number', 'side', 'page_type' and
    'sort_order'. Unrecognised names are returned as page_type 'OTHER'
    with sort_order 999999.
    """
    # Standard folio pages: recto sorts immediately before verso
    m = _SIENA_PAGE_RE.match(filename)
    if m:
        folio = int(m.group(1))
        side = m.group(2)
        offset = 0 if side == 'r' else 1
        return _image_record(
            'PAGE', folio * 2 + offset, folio_number=str(folio), side=side
        )

    # Covers: front cover sorts first of all, back cover after every folio
    if 'antcop' in filename:
        return _image_record('COVER', -2)
    if 'postcop' in filename:
        return _image_record('COVER', 99999)

    # Guard leaves: every one gets sort_order -1 (between front cover and
    # the first folio)
    if 'fdg' in filename or 'risg' in filename:
        return _image_record('GUARD', -1)

    # Marginalia detail photos: sorted after the back cover
    m = _SIENA_DETAIL_RE.match(filename)
    if m:
        return _image_record('MARGINALIA_DETAIL', 100000 + int(m.group(1)))

    return _image_record('OTHER', 999999)


def catalog_manuscript(conn: sqlite3.Connection, shelfmark: str, parser_fn) -> int:
    """Catalog all images for a manuscript.

    Reads from the original high-quality image directory (manuscripts.image_dir).
    Sets master_path to the original and web_path to the compressed site copy.

    Args:
        conn: Open SQLite connection; the inserts are committed before
            returning.
        shelfmark: Value looked up in manuscripts.shelfmark.
        parser_fn: Callable taking a filename and returning a dict with
            'folio_number', 'side', 'page_type' and 'sort_order'.

    Returns:
        The number of *.jpg files found in the master directory. Rows are
        written with INSERT OR IGNORE, so files whose insert was ignored
        are still counted. Returns 0 if the shelfmark is not in the
        manuscripts table.

    Exits the process with status 1 (via sys.exit) when the manuscript has
    no image_dir recorded or the master directory does not exist.
    """
    cur = conn.cursor()
    cur.execute("SELECT id, image_dir FROM manuscripts WHERE shelfmark = ?", (shelfmark,))
    row = cur.fetchone()
    if not row:
        print(f" ERROR: Manuscript {shelfmark} not found in database")
        return 0

    ms_id, image_dir = row
    if not image_dir:
        # A NULL/empty image_dir would otherwise fail with an opaque
        # TypeError when joined onto BASE_DIR.
        print(f" ERROR: No image_dir recorded for manuscript {shelfmark}")
        sys.exit(1)

    img_path = BASE_DIR / image_dir
    if not img_path.exists():
        print(f" ERROR: Master image directory not found: {img_path}")
        print(" The original high-quality images must be present.")
        sys.exit(1)

    web_dir = WEB_IMAGE_DIRS.get(shelfmark)
    if not web_dir:
        print(f" WARNING: No web image directory configured for {shelfmark}")
    # Compressed copies live under site/<web_dir>/
    web_root = SITE_DIR / web_dir if web_dir else None

    rows = []
    missing_web = 0
    for f in sorted(img_path.glob('*.jpg')):
        parsed = parser_fn(f.name)
        master = f"{image_dir}/{f.name}"
        web = f"{web_dir}/{f.name}" if web_dir else None

        # Check web copy exists; web_path is recorded either way
        if web_root is not None and not (web_root / f.name).exists():
            missing_web += 1

        # relative_path kept as web_path for backward compatibility
        rows.append((
            ms_id, f.name, parsed['folio_number'], parsed['side'],
            parsed['page_type'], parsed['sort_order'],
            web or master, master, web,
        ))

    cur.executemany(
        """INSERT OR IGNORE INTO images
           (manuscript_id, filename, folio_number, side, page_type,
            sort_order, relative_path, master_path, web_path)
           VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        rows,
    )

    if missing_web:
        print(f" WARNING: {missing_web} images lack compressed web copies in site/{web_dir}/")
        print(" Run compress_images.py to generate them.")

    conn.commit()
    return len(rows)


def main():
    """Catalog both manuscripts' images and print summary statistics."""
    # closing() guarantees the connection is released even if cataloging
    # raises or exits early via sys.exit().
    with closing(sqlite3.connect(DB_PATH)) as conn:
        print("Cataloging BL C.60.o.12 images (from originals)...")
        bl_count = catalog_manuscript(conn, 'C.60.o.12', parse_bl_filename)
        print(f" {bl_count} master images cataloged")

        print("Cataloging Siena O.III.38 images (from originals)...")
        siena_count = catalog_manuscript(conn, 'O.III.38', parse_siena_filename)
        print(f" {siena_count} master images cataloged")

        # Summary stats
        cur = conn.cursor()
        print("\nPage type breakdown:")
        for page_type in ['PAGE', 'MARGINALIA_DETAIL', 'COVER', 'GUARD', 'OTHER']:
            cur.execute("SELECT COUNT(*) FROM images WHERE page_type = ?", (page_type,))
            print(f" {page_type}: {cur.fetchone()[0]}")

        # Path coverage
        cur.execute("SELECT COUNT(*) FROM images WHERE master_path IS NOT NULL")
        master_count = cur.fetchone()[0]
        cur.execute("SELECT COUNT(*) FROM images WHERE web_path IS NOT NULL")
        web_count = cur.fetchone()[0]
        cur.execute("SELECT COUNT(*) FROM images")
        total = cur.fetchone()[0]
        print("\nPath coverage:")
        print(f" {master_count}/{total} images have master_path (high-quality originals)")
        print(f" {web_count}/{total} images have web_path (compressed for site)")

    print("Done.")


if __name__ == "__main__":
    main()