use std::path::Path; use super::{ExtractedMetadata, MetadataExtractor}; use crate::{ error::{PinakesError, Result}, media_type::{BuiltinMediaType, MediaType}, }; pub struct DocumentExtractor; impl MetadataExtractor for DocumentExtractor { fn extract(&self, path: &Path) -> Result { match MediaType::from_path(path) { Some(MediaType::Builtin(BuiltinMediaType::Pdf)) => extract_pdf(path), Some(MediaType::Builtin(BuiltinMediaType::Epub)) => extract_epub(path), Some(MediaType::Builtin(BuiltinMediaType::Djvu)) => extract_djvu(path), _ => Ok(ExtractedMetadata::default()), } } fn supported_types(&self) -> Vec { vec![ MediaType::Builtin(BuiltinMediaType::Pdf), MediaType::Builtin(BuiltinMediaType::Epub), MediaType::Builtin(BuiltinMediaType::Djvu), ] } } fn extract_pdf(path: &Path) -> Result { let doc = lopdf::Document::load(path) .map_err(|e| PinakesError::MetadataExtraction(format!("PDF load: {e}")))?; let mut meta = ExtractedMetadata::default(); let mut book_meta = crate::model::BookMetadata::default(); // Find the Info dictionary via the trailer if let Ok(info_ref) = doc.trailer.get(b"Info") { let info_obj = info_ref .as_reference() .map_or(Some(info_ref), |reference| doc.get_object(reference).ok()); if let Some(obj) = info_obj && let Ok(dict) = obj.as_dict() { if let Ok(title) = dict.get(b"Title") { meta.title = pdf_object_to_string(title); } if let Ok(author) = dict.get(b"Author") { let author_str = pdf_object_to_string(author); meta.artist.clone_from(&author_str); // Parse multiple authors if separated by semicolon, comma, or "and" if let Some(authors_str) = author_str { book_meta.authors = authors_str .split(&[';', ','][..]) .flat_map(|part| part.split(" and ")) .map(|name| name.trim().to_string()) .filter(|name| !name.is_empty()) .enumerate() .map(|(pos, name)| { let mut author = crate::model::AuthorInfo::new(name); author.position = i32::try_from(pos).unwrap_or(i32::MAX); author }) .collect(); } } if let Ok(subject) = dict.get(b"Subject") { meta.description = pdf_object_to_string(subject); } if let Ok(creator) = dict.get(b"Creator") { meta.extra.insert( "creator".to_string(), pdf_object_to_string(creator).unwrap_or_default(), ); } if let Ok(producer) = dict.get(b"Producer") { meta.extra.insert( "producer".to_string(), pdf_object_to_string(producer).unwrap_or_default(), ); } } } // Page count let pages = doc.get_pages(); let page_count = pages.len(); if page_count > 0 { book_meta.page_count = Some(i32::try_from(page_count).unwrap_or(i32::MAX)); } // Try to extract ISBN from first few pages // Extract text from up to the first 5 pages and search for ISBN patterns let mut extracted_text = String::new(); let max_pages = page_count.min(5); for (_page_num, page_id) in pages.iter().take(max_pages) { if let Ok(content) = doc.get_page_content(*page_id) { // PDF content streams contain raw operators, but may have text strings if let Ok(text) = std::str::from_utf8(&content) { extracted_text.push_str(text); extracted_text.push(' '); } } } // Extract ISBN from the text if let Some(isbn) = crate::books::extract_isbn_from_text(&extracted_text) && let Ok(normalized) = crate::books::normalize_isbn(&isbn) { book_meta.isbn13 = Some(normalized); book_meta.isbn = Some(isbn); } // Set format book_meta.format = Some("pdf".to_string()); meta.book_metadata = Some(book_meta); Ok(meta) } fn pdf_object_to_string(obj: &lopdf::Object) -> Option { match obj { lopdf::Object::String(bytes, _) => { Some(String::from_utf8_lossy(bytes).into_owned()) }, lopdf::Object::Name(name) => { Some(String::from_utf8_lossy(name).into_owned()) }, _ => None, } } fn extract_epub(path: &Path) -> Result { let mut doc = epub::doc::EpubDoc::new(path).map_err(|e| { PinakesError::MetadataExtraction(format!("EPUB parse: {e}")) })?; let mut meta = ExtractedMetadata { title: doc.mdata("title").map(|item| item.value.clone()), artist: doc.mdata("creator").map(|item| item.value.clone()), description: doc.mdata("description").map(|item| item.value.clone()), ..Default::default() }; let mut book_meta = crate::model::BookMetadata::default(); // Extract basic metadata if let Some(lang) = doc.mdata("language") { book_meta.language = Some(lang.value.clone()); } if let Some(publisher) = doc.mdata("publisher") { book_meta.publisher = Some(publisher.value.clone()); } if let Some(date) = doc.mdata("date") { // Try to parse as YYYY-MM-DD or just YYYY if let Ok(parsed_date) = chrono::NaiveDate::parse_from_str(&date.value, "%Y-%m-%d") { book_meta.publication_date = Some(parsed_date); } else if let Ok(year) = date.value.parse::() { book_meta.publication_date = chrono::NaiveDate::from_ymd_opt(year, 1, 1); } } // Extract authors - iterate through all metadata items let mut authors = Vec::new(); let mut position = 0; for item in &doc.metadata { if item.property == "creator" || item.property == "dc:creator" { let mut author = crate::model::AuthorInfo::new(item.value.clone()); author.position = position; position += 1; // Check for file-as in refinements if let Some(file_as_ref) = item.refinement("file-as") { author.file_as = Some(file_as_ref.value.clone()); } // Check for role in refinements if let Some(role_ref) = item.refinement("role") { author.role.clone_from(&role_ref.value); } authors.push(author); } } book_meta.authors = authors; // Extract ISBNs from identifiers let mut identifiers = std::collections::HashMap::new(); for item in &doc.metadata { if item.property == "identifier" || item.property == "dc:identifier" { // Try to get scheme from refinements let scheme = item .refinement("identifier-type") .map(|r| r.value.to_lowercase()); let id_type = match scheme.as_deref() { Some("isbn" | "isbn-10" | "isbn10") => "isbn", Some("isbn-13" | "isbn13") => "isbn13", Some("asin") => "asin", Some("doi") => "doi", _ => { // Fallback: detect from value pattern. // ISBN-10 = 10 chars bare, ISBN-13 = 13 chars bare, // hyphenated ISBN-13 = 17 chars (e.g. 978-0-123-45678-9). // Parentheses required: && binds tighter than ||. if (item.value.len() == 10 || item.value.len() == 13) || (item.value.contains('-') && (item.value.len() == 13 || item.value.len() == 17)) { "isbn" } else { "other" } }, }; // Try to normalize ISBN if (id_type == "isbn" || id_type == "isbn13") && let Ok(normalized) = crate::books::normalize_isbn(&item.value) { book_meta.isbn13 = Some(normalized.clone()); book_meta.isbn = Some(item.value.clone()); } identifiers .entry(id_type.to_string()) .or_insert_with(Vec::new) .push(item.value.clone()); } } book_meta.identifiers = identifiers; // Extract Calibre series metadata by parsing the content.opf file // Try common OPF locations let opf_paths = vec!["OEBPS/content.opf", "content.opf", "OPS/content.opf"]; let mut opf_data = None; for path in opf_paths { if let Some(data) = doc.get_resource_str_by_path(path) { opf_data = Some(data); break; } } if let Some(opf_content) = opf_data { // Look for if let Some(series_start) = opf_content.find("name=\"calibre:series\"") && let Some(content_start) = opf_content[series_start..].find("content=\"") { let after_content = &opf_content[series_start + content_start + 9..]; if let Some(quote_end) = after_content.find('"') { book_meta.series_name = Some(after_content[..quote_end].to_string()); } } // Look for if let Some(index_start) = opf_content.find("name=\"calibre:series_index\"") && let Some(content_start) = opf_content[index_start..].find("content=\"") { let after_content = &opf_content[index_start + content_start + 9..]; if let Some(quote_end) = after_content.find('"') && let Ok(index) = after_content[..quote_end].parse::() { book_meta.series_index = Some(index); } } } // Set format book_meta.format = Some("epub".to_string()); meta.book_metadata = Some(book_meta); Ok(meta) } fn extract_djvu(path: &Path) -> Result { // DjVu files contain metadata in SEXPR (S-expression) format within // ANTa/ANTz chunks, or in the DIRM chunk. We parse the raw bytes to // extract any metadata fields we can find. // Guard against loading very large DjVu files into memory. const MAX_DJVU_SIZE: u64 = 50 * 1024 * 1024; // 50 MB let file_meta = std::fs::metadata(path) .map_err(|e| PinakesError::MetadataExtraction(format!("DjVu stat: {e}")))?; if file_meta.len() > MAX_DJVU_SIZE { return Ok(ExtractedMetadata::default()); } let data = std::fs::read(path) .map_err(|e| PinakesError::MetadataExtraction(format!("DjVu read: {e}")))?; let mut meta = ExtractedMetadata::default(); // DjVu files start with "AT&T" magic followed by FORM:DJVU or FORM:DJVM if data.len() < 16 { return Ok(meta); } // Search for metadata annotations in the file. DjVu metadata is stored // as S-expressions like (metadata (key "value") ...) within ANTa chunks. let content = String::from_utf8_lossy(&data); // Look for (metadata ...) blocks if let Some(meta_start) = content.find("(metadata") { let remainder = &content[meta_start..]; // Extract key-value pairs like (title "Some Title") extract_djvu_field(remainder, "title", &mut meta.title); extract_djvu_field(remainder, "author", &mut meta.artist); let mut desc = None; extract_djvu_field(remainder, "subject", &mut desc); if desc.is_none() { extract_djvu_field(remainder, "description", &mut desc); } meta.description = desc; let mut year_str = None; extract_djvu_field(remainder, "year", &mut year_str); if let Some(ref y) = year_str { meta.year = y.parse().ok(); } let mut creator = None; extract_djvu_field(remainder, "creator", &mut creator); if let Some(c) = creator { meta.extra.insert("creator".to_string(), c); } } // Also check for booklet-style metadata that some DjVu encoders write // outside the metadata SEXPR if meta.title.is_none() && let Some(title_start) = content.find("(bookmarks") { let remainder = &content[title_start..]; // First bookmark title is often the document title if let Some(q1) = remainder.find('"') { let after_q1 = &remainder[q1 + 1..]; if let Some(q2) = after_q1.find('"') { let val = &after_q1[..q2]; if !val.is_empty() { meta.title = Some(val.to_string()); } } } } Ok(meta) } fn extract_djvu_field(sexpr: &str, key: &str, out: &mut Option) { // Look for patterns like (key "value") in the S-expression let pattern = format!("({key}"); if let Some(start) = sexpr.find(&pattern) { let remainder = &sexpr[start + pattern.len()..]; // Find the quoted value if let Some(q1) = remainder.find('"') { let after_q1 = &remainder[q1 + 1..]; if let Some(q2) = after_q1.find('"') { let val = &after_q1[..q2]; if !val.is_empty() { *out = Some(val.to_string()); } } } } }