From d5be5026a7e0d425e2c1550d07d1b1bdeb0bdeff Mon Sep 17 00:00:00 2001
From: NotAShelf <raf@notashelf.dev>
Date: Sun, 8 Mar 2026 00:42:01 +0300
Subject: [PATCH] pinakes-core: fix isbn regex, csv quoting, document
 extraction, and enrichment accuracy

Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: I974959e74d2b5b5591437daa0f29291a6a6a6964
---
 crates/pinakes-core/src/books.rs              | 61 +++++++++++--------
 .../src/enrichment/musicbrainz.rs             | 30 +++++----
 crates/pinakes-core/src/export.rs             | 40 ++++++++----
 crates/pinakes-core/src/metadata/document.rs  | 47 +++++++-------
 crates/pinakes-core/src/model.rs              | 44 +++++++------
 5 files changed, 132 insertions(+), 90 deletions(-)
diff --git a/crates/pinakes-core/src/books.rs b/crates/pinakes-core/src/books.rs
index 92eb174..7a4a554 100644
--- a/crates/pinakes-core/src/books.rs
+++ b/crates/pinakes-core/src/books.rs
@@ -1,6 +1,32 @@
+use std::sync::LazyLock;
+
 use crate::error::{PinakesError, Result};
 
+/// Compiled ISBN extraction patterns. Compiled once, reused on every call.
+static ISBN_PATTERNS: LazyLock<Vec<regex::Regex>> = LazyLock::new(|| {
+  [
+    // ISBN followed by colon or "is" with hyphens (most specific)
+    r"ISBN(?:-13)?(?:\s+is|:)?\s*(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)",
+    r"ISBN(?:-10)?(?:\s+is|:)?\s*(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])",
+    // ISBN with just whitespace
+    r"ISBN(?:-13)?\s+(\d{13})",
+    r"ISBN(?:-10)?\s+(\d{9}[\dXx])",
+    // Bare ISBN-13 with hyphens (in case "ISBN" is missing)
+    r"\b(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)\b",
+    // Bare ISBN-10 with hyphens
+    r"\b(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])\b",
+  ]
+  .iter()
+  .filter_map(|p| regex::Regex::new(p).ok())
+  .collect()
+});
+
 /// Normalize ISBN to ISBN-13 format
+///
+/// # Errors
+///
+/// Returns [`PinakesError`] if the ISBN is invalid or has an incorrect
+/// checksum.
 pub fn normalize_isbn(isbn: &str) -> Result<String> {
   // Remove hyphens, spaces, and any non-numeric characters (except X for
   // ISBN-10)
@@ -16,15 +42,13 @@ pub fn normalize_isbn(isbn: &str) -> Result<String> {
         Ok(clean)
       } else {
         Err(PinakesError::InvalidData(format!(
-          "Invalid ISBN-13 checksum: {}",
-          isbn
+          "Invalid ISBN-13 checksum: {isbn}"
         )))
       }
     },
     _ => {
       Err(PinakesError::InvalidData(format!(
-        "Invalid ISBN length: {}",
-        isbn
+        "Invalid ISBN length: {isbn}"
       )))
     },
   }
@@ -34,8 +58,7 @@ pub fn normalize_isbn(isbn: &str) -> Result<String> {
 fn isbn10_to_isbn13(isbn10: &str) -> Result<String> {
   if isbn10.len() != 10 {
     return Err(PinakesError::InvalidData(format!(
-      "ISBN-10 must be 10 characters: {}",
-      isbn10
+      "ISBN-10 must be 10 characters: {isbn10}"
     )));
   }
 
@@ -87,37 +110,21 @@ fn is_valid_isbn13(isbn13: &str) -> bool {
 }
 
 /// Extract ISBN from text (searches for ISBN-10 or ISBN-13 patterns)
+#[must_use]
 pub fn extract_isbn_from_text(text: &str) -> Option<String> {
-  use regex::Regex;
-
-  // Try different patterns in order of specificity
-  let patterns = vec![
-    // ISBN followed by colon or "is" with hyphens (most specific)
-    r"ISBN(?:-13)?(?:\s+is|:)?\s*(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)",
-    r"ISBN(?:-10)?(?:\s+is|:)?\s*(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])",
-    // ISBN with just whitespace
-    r"ISBN(?:-13)?\s+(\d{13})",
-    r"ISBN(?:-10)?\s+(\d{9}[\dXx])",
-    // Bare ISBN-13 with hyphens (in case "ISBN" is missing)
-    r"\b(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)\b",
-    // Bare ISBN-10 with hyphens
-    r"\b(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])\b",
-  ];
-
-  for pattern_str in patterns {
-    if let Ok(pattern) = Regex::new(pattern_str)
-      && let Some(captures) = pattern.captures(text)
+  for pattern in ISBN_PATTERNS.iter() {
+    if let Some(captures) = pattern.captures(text)
       && let Some(isbn) = captures.get(1)
       && let Ok(normalized) = normalize_isbn(isbn.as_str())
     {
       return Some(normalized);
     }
   }
-
   None
 }
 
 /// Parse author name into "Last, First" format for sorting
+#[must_use]
 pub fn parse_author_file_as(name: &str) -> String {
   // Simple heuristic: if already contains comma, use as-is
   if name.contains(',') {
@@ -136,7 +143,7 @@ pub fn parse_author_file_as(name: &str) -> String {
         return String::new();
       };
       let given_names = parts[..parts.len() - 1].join(" ");
-      format!("{}, {}", surname, given_names)
+      format!("{surname}, {given_names}")
     },
   }
 }
diff --git a/crates/pinakes-core/src/enrichment/musicbrainz.rs b/crates/pinakes-core/src/enrichment/musicbrainz.rs
index 1b315a6..9115f38 100644
--- a/crates/pinakes-core/src/enrichment/musicbrainz.rs
+++ b/crates/pinakes-core/src/enrichment/musicbrainz.rs
@@ -1,6 +1,6 @@
-//! MusicBrainz metadata enrichment for audio files.
+//! `MusicBrainz` metadata enrichment for audio files.
 
-use std::time::Duration;
+use std::{fmt::Write as _, time::Duration};
 
 use chrono::Utc;
 use uuid::Uuid;
@@ -23,14 +23,17 @@ impl Default for MusicBrainzEnricher {
 }
 
 impl MusicBrainzEnricher {
+  /// Create a new `MusicBrainzEnricher`.
+  #[must_use]
   pub fn new() -> Self {
+    let client = reqwest::Client::builder()
+      .user_agent("Pinakes/0.1 (https://github.com/notashelf/pinakes)")
+      .timeout(Duration::from_secs(10))
+      .connect_timeout(Duration::from_secs(5))
+      .build()
+      .unwrap_or_else(|_| reqwest::Client::new());
     Self {
-      client:   reqwest::Client::builder()
-        .user_agent("Pinakes/0.1 (https://github.com/notashelf/pinakes)")
-        .timeout(Duration::from_secs(10))
-        .connect_timeout(Duration::from_secs(5))
-        .build()
-        .expect("failed to build HTTP client with configured timeouts"),
+      client,
       base_url: "https://musicbrainz.org/ws/2".to_string(),
     }
   }
@@ -65,7 +68,7 @@ impl MetadataEnricher for MusicBrainzEnricher {
 
     let mut query = format!("recording:{}", escape_lucene_query(title));
     if let Some(ref artist) = item.artist {
-      query.push_str(&format!(" AND artist:{}", escape_lucene_query(artist)));
+      let _ = write!(query, " AND artist:{}", escape_lucene_query(artist));
     }
 
     let url = format!("{}/recording/", self.base_url);
@@ -113,7 +116,7 @@ impl MetadataEnricher for MusicBrainzEnricher {
     })?;
 
     let recordings = json.get("recordings").and_then(|r| r.as_array());
-    if recordings.is_none_or(|r| r.is_empty()) {
+    if recordings.is_none_or(std::vec::Vec::is_empty) {
       return Ok(None);
     }
 
@@ -125,11 +128,12 @@ impl MetadataEnricher for MusicBrainzEnricher {
       .get("id")
       .and_then(|id| id.as_str())
       .map(String::from);
-    let score = recording
+    let score = (recording
       .get("score")
-      .and_then(|s| s.as_f64())
+      .and_then(serde_json::Value::as_f64)
       .unwrap_or(0.0)
-      / 100.0;
+      / 100.0)
+      .min(1.0);
 
     Ok(Some(ExternalMetadata {
       id: Uuid::now_v7(),
diff --git a/crates/pinakes-core/src/export.rs b/crates/pinakes-core/src/export.rs
index a68bf63..50542b9 100644
--- a/crates/pinakes-core/src/export.rs
+++ b/crates/pinakes-core/src/export.rs
@@ -2,6 +2,16 @@ use std::path::Path;
 
 use serde::{Deserialize, Serialize};
 
+/// Quote a string field for CSV output: wrap in double-quotes and escape
+/// internal double-quotes by doubling them (RFC 4180).
+fn csv_quote(s: &str) -> String {
+  if s.contains([',', '"', '\n', '\r']) {
+    format!("\"{}\"", s.replace('"', "\"\""))
+  } else {
+    s.to_string()
+  }
+}
+
 use crate::{error::Result, jobs::ExportFormat, storage::DynStorageBackend};
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -11,6 +21,11 @@ pub struct ExportResult {
 }
 
 /// Export library data to the specified format.
+///
+/// # Errors
+///
+/// Returns an error if the storage query fails or if the output file cannot
+/// be written.
 pub async fn export_library(
   storage: &DynStorageBackend,
   format: &ExportFormat,
@@ -38,27 +53,30 @@ pub async fn export_library(
          album,genre,year,duration_secs,description,created_at,updated_at\n",
       );
       for item in &items {
-        csv.push_str(&format!(
-          "{},{},{},{:?},{},{},{},{},{},{},{},{},{},{},{}\n",
+        use std::fmt::Write as _;
+        writeln!(
+          csv,
+          "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}",
           item.id,
-          item.path.display(),
-          item.file_name,
-          item.media_type,
+          csv_quote(&item.path.display().to_string()),
+          csv_quote(&item.file_name),
+          csv_quote(&format!("{:?}", item.media_type)),
           item.content_hash,
           item.file_size,
-          item.title.as_deref().unwrap_or(""),
-          item.artist.as_deref().unwrap_or(""),
-          item.album.as_deref().unwrap_or(""),
-          item.genre.as_deref().unwrap_or(""),
+          csv_quote(item.title.as_deref().unwrap_or("")),
+          csv_quote(item.artist.as_deref().unwrap_or("")),
+          csv_quote(item.album.as_deref().unwrap_or("")),
+          csv_quote(item.genre.as_deref().unwrap_or("")),
           item.year.map(|y| y.to_string()).unwrap_or_default(),
           item
             .duration_secs
             .map(|d| d.to_string())
             .unwrap_or_default(),
-          item.description.as_deref().unwrap_or(""),
+          csv_quote(item.description.as_deref().unwrap_or("")),
           item.created_at,
           item.updated_at,
-        ));
+        )
+        .unwrap_or_default();
       }
       std::fs::write(destination, csv)?;
     },
diff --git a/crates/pinakes-core/src/metadata/document.rs b/crates/pinakes-core/src/metadata/document.rs
index b548177..f284c51 100644
--- a/crates/pinakes-core/src/metadata/document.rs
+++ b/crates/pinakes-core/src/metadata/document.rs
@@ -36,11 +36,9 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
 
   // Find the Info dictionary via the trailer
   if let Ok(info_ref) = doc.trailer.get(b"Info") {
-    let info_obj = if let Ok(reference) = info_ref.as_reference() {
-      doc.get_object(reference).ok()
-    } else {
-      Some(info_ref)
-    };
+    let info_obj = info_ref
+      .as_reference()
+      .map_or(Some(info_ref), |reference| doc.get_object(reference).ok());
 
     if let Some(obj) = info_obj
       && let Ok(dict) = obj.as_dict()
@@ -50,23 +48,19 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
       }
       if let Ok(author) = dict.get(b"Author") {
         let author_str = pdf_object_to_string(author);
-        meta.artist = author_str.clone();
+        meta.artist.clone_from(&author_str);
 
         // Parse multiple authors if separated by semicolon, comma, or "and"
         if let Some(authors_str) = author_str {
-          let author_names: Vec<String> = authors_str
+          book_meta.authors = authors_str
             .split(&[';', ','][..])
             .flat_map(|part| part.split(" and "))
             .map(|name| name.trim().to_string())
             .filter(|name| !name.is_empty())
-            .collect();
-
-          book_meta.authors = author_names
-            .into_iter()
             .enumerate()
             .map(|(pos, name)| {
               let mut author = crate::model::AuthorInfo::new(name);
-              author.position = pos as i32;
+              author.position = i32::try_from(pos).unwrap_or(i32::MAX);
               author
             })
             .collect();
@@ -94,7 +88,7 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
   let pages = doc.get_pages();
   let page_count = pages.len();
   if page_count > 0 {
-    book_meta.page_count = Some(page_count as i32);
+    book_meta.page_count = Some(i32::try_from(page_count).unwrap_or(i32::MAX));
   }
 
   // Try to extract ISBN from first few pages
@@ -187,7 +181,7 @@ fn extract_epub(path: &Path) -> Result<ExtractedMetadata> {
 
       // Check for role in refinements
       if let Some(role_ref) = item.refinement("role") {
-        author.role = role_ref.value.clone();
+        author.role.clone_from(&role_ref.value);
       }
 
       authors.push(author);
@@ -205,16 +199,18 @@ fn extract_epub(path: &Path) -> Result<ExtractedMetadata> {
         .map(|r| r.value.to_lowercase());
 
       let id_type = match scheme.as_deref() {
-        Some("isbn") => "isbn",
-        Some("isbn-10") | Some("isbn10") => "isbn",
-        Some("isbn-13") | Some("isbn13") => "isbn13",
+        Some("isbn" | "isbn-10" | "isbn10") => "isbn",
+        Some("isbn-13" | "isbn13") => "isbn13",
         Some("asin") => "asin",
         Some("doi") => "doi",
         _ => {
-          // Fallback: detect from value pattern
-          if item.value.len() == 10
-            || item.value.len() == 13
-            || item.value.contains('-') && item.value.len() < 20
+          // Fallback: detect from value pattern.
+          // ISBN-10 = 10 chars bare, ISBN-13 = 13 chars bare,
+          // hyphenated ISBN-13 = 17 chars (e.g. 978-0-123-45678-9).
+          // Parentheses required: && binds tighter than ||.
+          if (item.value.len() == 10 || item.value.len() == 13)
+            || (item.value.contains('-')
+              && (item.value.len() == 13 || item.value.len() == 17))
           {
             "isbn"
           } else {
@@ -286,6 +282,15 @@ fn extract_djvu(path: &Path) -> Result<ExtractedMetadata> {
   // DjVu files contain metadata in SEXPR (S-expression) format within
   // ANTa/ANTz chunks, or in the DIRM chunk. We parse the raw bytes to
   // extract any metadata fields we can find.
+
+  // Guard against loading very large DjVu files into memory.
+  const MAX_DJVU_SIZE: u64 = 50 * 1024 * 1024; // 50 MB
+  let file_meta = std::fs::metadata(path)
+    .map_err(|e| PinakesError::MetadataExtraction(format!("DjVu stat: {e}")))?;
+  if file_meta.len() > MAX_DJVU_SIZE {
+    return Ok(ExtractedMetadata::default());
+  }
+
   let data = std::fs::read(path)
     .map_err(|e| PinakesError::MetadataExtraction(format!("DjVu read: {e}")))?;
 
diff --git a/crates/pinakes-core/src/model.rs b/crates/pinakes-core/src/model.rs
index 6838c6f..cedf0ef 100644
--- a/crates/pinakes-core/src/model.rs
+++ b/crates/pinakes-core/src/model.rs
@@ -11,7 +11,8 @@ use crate::media_type::MediaType;
 pub struct MediaId(pub Uuid);
 
 impl MediaId {
-  /// Creates a new media ID using UUIDv7.
+  /// Creates a new media ID using `UUIDv7`.
+  #[must_use]
   pub fn new() -> Self {
     Self(Uuid::now_v7())
   }
@@ -25,7 +26,7 @@ impl fmt::Display for MediaId {
 
 impl Default for MediaId {
   fn default() -> Self {
-    Self::new()
+    Self(uuid::Uuid::nil())
   }
 }
 
@@ -35,7 +36,8 @@ pub struct ContentHash(pub String);
 
 impl ContentHash {
   /// Creates a new content hash from a hex string.
-  pub fn new(hex: String) -> Self {
+  #[must_use]
+  pub const fn new(hex: String) -> Self {
     Self(hex)
   }
 }
@@ -75,7 +77,7 @@ impl std::str::FromStr for StorageMode {
     match s.to_lowercase().as_str() {
       "external" => Ok(Self::External),
       "managed" => Ok(Self::Managed),
-      _ => Err(format!("unknown storage mode: {}", s)),
+      _ => Err(format!("unknown storage mode: {s}")),
     }
   }
 }
@@ -147,11 +149,11 @@ pub struct MediaItem {
   #[serde(default)]
   pub storage_mode:      StorageMode,
   /// Original filename for uploaded files (preserved separately from
-  /// file_name)
+  /// `file_name`)
   pub original_filename: Option<String>,
   /// When the file was uploaded to managed storage
   pub uploaded_at:       Option<DateTime<Utc>>,
-  /// Storage key for looking up the blob (usually same as content_hash)
+  /// Storage key for looking up the blob (usually same as `content_hash`)
   pub storage_key:       Option<String>,
 
   pub created_at: DateTime<Utc>,
@@ -182,7 +184,8 @@ pub enum CustomFieldType {
 }
 
 impl CustomFieldType {
-  pub fn as_str(&self) -> &'static str {
+  #[must_use]
+  pub const fn as_str(&self) -> &'static str {
     match self {
       Self::Text => "text",
       Self::Number => "number",
@@ -228,7 +231,8 @@ pub enum CollectionKind {
 }
 
 impl CollectionKind {
-  pub fn as_str(&self) -> &'static str {
+  #[must_use]
+  pub const fn as_str(&self) -> &'static str {
     match self {
       Self::Manual => "manual",
       Self::Virtual => "virtual",
@@ -380,7 +384,8 @@ pub struct Pagination {
 
 impl Pagination {
   /// Creates a new pagination instance.
-  pub fn new(offset: u64, limit: u64, sort: Option<String>) -> Self {
+  #[must_use]
+  pub const fn new(offset: u64, limit: u64, sort: Option<String>) -> Self {
     Self {
       offset,
       limit,
@@ -431,7 +436,7 @@ pub struct BookMetadata {
 }
 
 /// Information about a book author.
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub struct AuthorInfo {
   pub name:     String,
   pub role:     String,
@@ -441,6 +446,7 @@ pub struct AuthorInfo {
 
 impl AuthorInfo {
   /// Creates a new author with the given name.
+  #[must_use]
   pub fn new(name: String) -> Self {
     Self {
       name,
@@ -451,17 +457,20 @@ impl AuthorInfo {
   }
 
   /// Sets the author's role.
+  #[must_use]
   pub fn with_role(mut self, role: String) -> Self {
     self.role = role;
     self
   }
 
+  #[must_use]
   pub fn with_file_as(mut self, file_as: String) -> Self {
     self.file_as = Some(file_as);
     self
   }
 
-  pub fn with_position(mut self, position: i32) -> Self {
+  #[must_use]
+  pub const fn with_position(mut self, position: i32) -> Self {
     self.position = position;
     self
   }
@@ -496,21 +505,20 @@ pub struct ReadingProgress {
 
 impl ReadingProgress {
   /// Creates a new reading progress entry.
+  #[must_use]
   pub fn new(
     media_id: MediaId,
     user_id: Uuid,
     current_page: i32,
     total_pages: Option<i32>,
   ) -> Self {
-    let progress_percent = if let Some(total) = total_pages {
+    let progress_percent = total_pages.map_or(0.0, |total| {
       if total > 0 {
-        (current_page as f64 / total as f64 * 100.0).min(100.0)
+        (f64::from(current_page) / f64::from(total) * 100.0).min(100.0)
       } else {
         0.0
       }
-    } else {
-      0.0
-    };
+    });
 
     Self {
       media_id,
@@ -574,7 +582,7 @@ impl std::str::FromStr for LinkType {
       "wikilink" => Ok(Self::Wikilink),
       "markdown_link" => Ok(Self::MarkdownLink),
       "embed" => Ok(Self::Embed),
-      _ => Err(format!("unknown link type: {}", s)),
+      _ => Err(format!("unknown link type: {s}")),
     }
   }
 }
@@ -586,7 +594,7 @@ pub struct MarkdownLink {
   pub source_media_id: MediaId,
   /// Raw link target as written in the source (wikilink name or path)
   pub target_path:     String,
-  /// Resolved target media_id (None if unresolved)
+  /// Resolved target `media_id` (None if unresolved)
   pub target_media_id: Option<MediaId>,
   pub link_type:       LinkType,
   /// Display text for the link