treewide: complete book management interface

Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: If5a21f16221f3c56a8008e139f93edc46a6a6964
2026-02-04 23:14:37 +03:00 · 2026-02-04 23:14:37 +03:00 · 2f31242442
commit 2f31242442
parent bda36ac152
23 changed files with 1693 additions and 126 deletions
--- a/crates/pinakes-core/src/metadata/document.rs
+++ b/crates/pinakes-core/src/metadata/document.rs
@ -31,6 +31,7 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
        .map_err(|e| PinakesError::MetadataExtraction(format!("PDF load: {e}")))?;

    let mut meta = ExtractedMetadata::default();
+    let mut book_meta = crate::model::ExtractedBookMetadata::default();

    // Find the Info dictionary via the trailer
    if let Ok(info_ref) = doc.trailer.get(b"Info") {
@ -47,7 +48,28 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
                meta.title = pdf_object_to_string(title);
            }
            if let Ok(author) = dict.get(b"Author") {
-                meta.artist = pdf_object_to_string(author);
+                let author_str = pdf_object_to_string(author);
+                meta.artist = author_str.clone();
+
+                // Parse multiple authors if separated by semicolon, comma, or "and"
+                if let Some(authors_str) = author_str {
+                    let author_names: Vec<String> = authors_str
+                        .split(&[';', ','][..])
+                        .flat_map(|part| part.split(" and "))
+                        .map(|name| name.trim().to_string())
+                        .filter(|name| !name.is_empty())
+                        .collect();
+
+                    book_meta.authors = author_names
+                        .into_iter()
+                        .enumerate()
+                        .map(|(pos, name)| {
+                            let mut author = crate::model::AuthorInfo::new(name);
+                            author.position = pos as i32;
+                            author
+                        })
+                        .collect();
+                }
            }
            if let Ok(subject) = dict.get(b"Subject") {
                meta.description = pdf_object_to_string(subject);
@ -68,12 +90,39 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
    }

    // Page count
-    let page_count = doc.get_pages().len();
+    let pages = doc.get_pages();
+    let page_count = pages.len();
    if page_count > 0 {
-        meta.extra
-            .insert("page_count".to_string(), page_count.to_string());
+        book_meta.page_count = Some(page_count as i32);
    }

+    // Try to extract ISBN from first few pages
+    // Extract text from up to the first 5 pages and search for ISBN patterns
+    let mut extracted_text = String::new();
+    let max_pages = page_count.min(5);
+
+    for (_page_num, page_id) in pages.iter().take(max_pages) {
+        if let Ok(content) = doc.get_page_content(*page_id) {
+            // PDF content streams contain raw operators, but may have text strings
+            if let Ok(text) = std::str::from_utf8(&content) {
+                extracted_text.push_str(text);
+                extracted_text.push(' ');
+            }
+        }
+    }
+
+    // Extract ISBN from the text
+    if let Some(isbn) = crate::books::extract_isbn_from_text(&extracted_text)
+        && let Ok(normalized) = crate::books::normalize_isbn(&isbn)
+    {
+        book_meta.isbn13 = Some(normalized);
+        book_meta.isbn = Some(isbn);
+    }
+
+    // Set format
+    book_meta.format = Some("pdf".to_string());
+
+    meta.book_metadata = Some(book_meta);
    Ok(meta)
 }

@ -86,7 +135,7 @@ fn pdf_object_to_string(obj: &lopdf::Object) -> Option<String> {
 }

 fn extract_epub(path: &Path) -> Result<ExtractedMetadata> {
-    let doc = epub::doc::EpubDoc::new(path)
+    let mut doc = epub::doc::EpubDoc::new(path)
        .map_err(|e| PinakesError::MetadataExtraction(format!("EPUB parse: {e}")))?;

    let mut meta = ExtractedMetadata {
@ -96,18 +145,131 @@ fn extract_epub(path: &Path) -> Result<ExtractedMetadata> {
        ..Default::default()
    };

+    let mut book_meta = crate::model::ExtractedBookMetadata::default();
+
+    // Extract basic metadata
    if let Some(lang) = doc.mdata("language") {
-        meta.extra
-            .insert("language".to_string(), lang.value.clone());
+        book_meta.language = Some(lang.value.clone());
    }
    if let Some(publisher) = doc.mdata("publisher") {
-        meta.extra
-            .insert("publisher".to_string(), publisher.value.clone());
+        book_meta.publisher = Some(publisher.value.clone());
    }
    if let Some(date) = doc.mdata("date") {
-        meta.extra.insert("date".to_string(), date.value.clone());
+        // Try to parse as YYYY-MM-DD or just YYYY
+        if let Ok(parsed_date) = chrono::NaiveDate::parse_from_str(&date.value, "%Y-%m-%d") {
+            book_meta.publication_date = Some(parsed_date);
+        } else if let Ok(year) = date.value.parse::<i32>() {
+            book_meta.publication_date = chrono::NaiveDate::from_ymd_opt(year, 1, 1);
+        }
    }

+    // Extract authors - iterate through all metadata items
+    let mut authors = Vec::new();
+    let mut position = 0;
+    for item in &doc.metadata {
+        if item.property == "creator" || item.property == "dc:creator" {
+            let mut author = crate::model::AuthorInfo::new(item.value.clone());
+            author.position = position;
+            position += 1;
+
+            // Check for file-as in refinements
+            if let Some(file_as_ref) = item.refinement("file-as") {
+                author.file_as = Some(file_as_ref.value.clone());
+            }
+
+            // Check for role in refinements
+            if let Some(role_ref) = item.refinement("role") {
+                author.role = role_ref.value.clone();
+            }
+
+            authors.push(author);
+        }
+    }
+    book_meta.authors = authors;
+
+    // Extract ISBNs from identifiers
+    let mut identifiers = std::collections::HashMap::new();
+    for item in &doc.metadata {
+        if item.property == "identifier" || item.property == "dc:identifier" {
+            // Try to get scheme from refinements
+            let scheme = item
+                .refinement("identifier-type")
+                .map(|r| r.value.to_lowercase());
+
+            let id_type = match scheme.as_deref() {
+                Some("isbn") => "isbn",
+                Some("isbn-10") | Some("isbn10") => "isbn",
+                Some("isbn-13") | Some("isbn13") => "isbn13",
+                Some("asin") => "asin",
+                Some("doi") => "doi",
+                _ => {
+                    // Fallback: detect from value pattern
+                    if item.value.len() == 10
+                        || item.value.len() == 13
+                        || item.value.contains('-') && item.value.len() < 20
+                    {
+                        "isbn"
+                    } else {
+                        "other"
+                    }
+                }
+            };
+
+            // Try to normalize ISBN
+            if (id_type == "isbn" || id_type == "isbn13")
+                && let Ok(normalized) = crate::books::normalize_isbn(&item.value)
+            {
+                book_meta.isbn13 = Some(normalized.clone());
+                book_meta.isbn = Some(item.value.clone());
+            }
+
+            identifiers
+                .entry(id_type.to_string())
+                .or_insert_with(Vec::new)
+                .push(item.value.clone());
+        }
+    }
+    book_meta.identifiers = identifiers;
+
+    // Extract Calibre series metadata by parsing the content.opf file
+    // Try common OPF locations
+    let opf_paths = vec!["OEBPS/content.opf", "content.opf", "OPS/content.opf"];
+    let mut opf_data = None;
+    for path in opf_paths {
+        if let Some(data) = doc.get_resource_str_by_path(path) {
+            opf_data = Some(data);
+            break;
+        }
+    }
+
+    if let Some(opf_content) = opf_data {
+        // Look for <meta name="calibre:series" content="Series Name"/>
+        if let Some(series_start) = opf_content.find("name=\"calibre:series\"")
+            && let Some(content_start) = opf_content[series_start..].find("content=\"")
+        {
+            let after_content = &opf_content[series_start + content_start + 9..];
+            if let Some(quote_end) = after_content.find('"') {
+                book_meta.series_name = Some(after_content[..quote_end].to_string());
+            }
+        }
+
+        // Look for <meta name="calibre:series_index" content="1.0"/>
+        if let Some(index_start) = opf_content.find("name=\"calibre:series_index\"")
+            && let Some(content_start) = opf_content[index_start..].find("content=\"")
+        {
+            let after_content = &opf_content[index_start + content_start + 9..];
+            if let Some(quote_end) = after_content.find('"')
+                && let Ok(index) = after_content[..quote_end].parse::<f64>()
+            {
+                book_meta.series_index = Some(index);
+            }
+        }
+    }
+
+    // Set format
+    book_meta.format = Some("epub".to_string());
+
+    meta.book_metadata = Some(book_meta);
    Ok(meta)
 }