pinakes-core: fix isbn regex, csv quoting, document extraction, and enrichment accuracy

Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: I974959e74d2b5b5591437daa0f29291a6a6a6964
2026-03-08 00:42:01 +03:00 · 2026-03-08 00:42:01 +03:00 · d5be5026a7
commit d5be5026a7
parent d77e5b9f2f
5 changed files with 132 additions and 90 deletions
--- a/crates/pinakes-core/src/metadata/document.rs
+++ b/crates/pinakes-core/src/metadata/document.rs
@ -36,11 +36,9 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {

  // Find the Info dictionary via the trailer
  if let Ok(info_ref) = doc.trailer.get(b"Info") {
-    let info_obj = if let Ok(reference) = info_ref.as_reference() {
-      doc.get_object(reference).ok()
-    } else {
-      Some(info_ref)
-    };
+    let info_obj = info_ref
+      .as_reference()
+      .map_or(Some(info_ref), |reference| doc.get_object(reference).ok());

    if let Some(obj) = info_obj
      && let Ok(dict) = obj.as_dict()
@ -50,23 +48,19 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
      }
      if let Ok(author) = dict.get(b"Author") {
        let author_str = pdf_object_to_string(author);
-        meta.artist = author_str.clone();
+        meta.artist.clone_from(&author_str);

        // Parse multiple authors if separated by semicolon, comma, or "and"
        if let Some(authors_str) = author_str {
-          let author_names: Vec<String> = authors_str
+          book_meta.authors = authors_str
            .split(&[';', ','][..])
            .flat_map(|part| part.split(" and "))
            .map(|name| name.trim().to_string())
            .filter(|name| !name.is_empty())
-            .collect();
-
-          book_meta.authors = author_names
-            .into_iter()
            .enumerate()
            .map(|(pos, name)| {
              let mut author = crate::model::AuthorInfo::new(name);
-              author.position = pos as i32;
+              author.position = i32::try_from(pos).unwrap_or(i32::MAX);
              author
            })
            .collect();
@ -94,7 +88,7 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
  let pages = doc.get_pages();
  let page_count = pages.len();
  if page_count > 0 {
-    book_meta.page_count = Some(page_count as i32);
+    book_meta.page_count = Some(i32::try_from(page_count).unwrap_or(i32::MAX));
  }

  // Try to extract ISBN from first few pages
@ -187,7 +181,7 @@ fn extract_epub(path: &Path) -> Result<ExtractedMetadata> {

      // Check for role in refinements
      if let Some(role_ref) = item.refinement("role") {
-        author.role = role_ref.value.clone();
+        author.role.clone_from(&role_ref.value);
      }

      authors.push(author);
@ -205,16 +199,18 @@ fn extract_epub(path: &Path) -> Result<ExtractedMetadata> {
        .map(|r| r.value.to_lowercase());

      let id_type = match scheme.as_deref() {
-        Some("isbn") => "isbn",
-        Some("isbn-10") | Some("isbn10") => "isbn",
-        Some("isbn-13") | Some("isbn13") => "isbn13",
+        Some("isbn" | "isbn-10" | "isbn10") => "isbn",
+        Some("isbn-13" | "isbn13") => "isbn13",
        Some("asin") => "asin",
        Some("doi") => "doi",
        _ => {
-          // Fallback: detect from value pattern
-          if item.value.len() == 10
-            || item.value.len() == 13
-            || item.value.contains('-') && item.value.len() < 20
+          // Fallback: detect from value length. Bare ISBN-10 = 10 chars,
+          // bare ISBN-13 = 13; hyphenated ISBN-10 = 13 (e.g. 0-306-40615-2),
+          // hyphenated ISBN-13 = 17 (e.g. 978-0-123-45678-9).
+          // The inner || group needs its parentheses: && binds tighter than ||.
+          if (item.value.len() == 10 || item.value.len() == 13)
+            || (item.value.contains('-')
+              && (item.value.len() == 13 || item.value.len() == 17))
          {
            "isbn"
          } else {
@ -286,6 +282,15 @@ fn extract_djvu(path: &Path) -> Result<ExtractedMetadata> {
  // DjVu files contain metadata in SEXPR (S-expression) format within
  // ANTa/ANTz chunks, or in the DIRM chunk. We parse the raw bytes to
  // extract any metadata fields we can find.
+
+  // Guard against loading very large DjVu files into memory.
+  const MAX_DJVU_SIZE: u64 = 50 * 1024 * 1024; // 50 MB
+  let file_meta = std::fs::metadata(path)
+    .map_err(|e| PinakesError::MetadataExtraction(format!("DjVu stat: {e}")))?;
+  if file_meta.len() > MAX_DJVU_SIZE {
+    return Ok(ExtractedMetadata::default());
+  }
+
  let data = std::fs::read(path)
    .map_err(|e| PinakesError::MetadataExtraction(format!("DjVu read: {e}")))?;