pinakes-core: fix isbn regex, csv quoting, document extraction, and enrichment accuracy

Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: I974959e74d2b5b5591437daa0f29291a6a6a6964
This commit is contained in:
raf 2026-03-08 00:42:01 +03:00
commit d5be5026a7
Signed by: NotAShelf
GPG key ID: 29D95B64378DB4BF
5 changed files with 132 additions and 90 deletions

View file

@ -1,6 +1,32 @@
use std::sync::LazyLock;
use crate::error::{PinakesError, Result};
/// Compiled ISBN extraction patterns. Compiled once, reused on every call.
///
/// Ordered from most to least specific so the first pattern that matches in
/// `extract_isbn_from_text` wins.
static ISBN_PATTERNS: LazyLock<Vec<regex::Regex>> = LazyLock::new(|| {
    [
        // ISBN followed by colon or "is" with hyphens (most specific)
        r"ISBN(?:-13)?(?:\s+is|:)?\s*(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)",
        r"ISBN(?:-10)?(?:\s+is|:)?\s*(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])",
        // ISBN with just whitespace
        r"ISBN(?:-13)?\s+(\d{13})",
        r"ISBN(?:-10)?\s+(\d{9}[\dXx])",
        // Bare ISBN-13 with hyphens (in case "ISBN" is missing)
        r"\b(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)\b",
        // Bare ISBN-10 with hyphens
        r"\b(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])\b",
    ]
    .iter()
    // The patterns are compile-time constants: a failure to compile is a
    // programming bug, so fail loudly instead of silently dropping the
    // pattern (`filter_map(.. .ok())` would hide the defect and quietly
    // degrade ISBN extraction).
    .map(|p| regex::Regex::new(p).expect("hard-coded ISBN regex must compile"))
    .collect()
});
/// Normalize ISBN to ISBN-13 format
///
/// # Errors
///
/// Returns [`PinakesError`] if the ISBN is invalid or has an incorrect
/// checksum.
pub fn normalize_isbn(isbn: &str) -> Result<String> {
// Remove hyphens, spaces, and any non-numeric characters (except X for
// ISBN-10)
@ -16,15 +42,13 @@ pub fn normalize_isbn(isbn: &str) -> Result<String> {
Ok(clean)
} else {
Err(PinakesError::InvalidData(format!(
"Invalid ISBN-13 checksum: {}",
isbn
"Invalid ISBN-13 checksum: {isbn}"
)))
}
},
_ => {
Err(PinakesError::InvalidData(format!(
"Invalid ISBN length: {}",
isbn
"Invalid ISBN length: {isbn}"
)))
},
}
@ -34,8 +58,7 @@ pub fn normalize_isbn(isbn: &str) -> Result<String> {
fn isbn10_to_isbn13(isbn10: &str) -> Result<String> {
if isbn10.len() != 10 {
return Err(PinakesError::InvalidData(format!(
"ISBN-10 must be 10 characters: {}",
isbn10
"ISBN-10 must be 10 characters: {isbn10}"
)));
}
@ -87,37 +110,21 @@ fn is_valid_isbn13(isbn13: &str) -> bool {
}
/// Extract ISBN from text (searches for ISBN-10 or ISBN-13 patterns)
#[must_use]
pub fn extract_isbn_from_text(text: &str) -> Option<String> {
use regex::Regex;
// Try different patterns in order of specificity
let patterns = vec![
// ISBN followed by colon or "is" with hyphens (most specific)
r"ISBN(?:-13)?(?:\s+is|:)?\s*(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)",
r"ISBN(?:-10)?(?:\s+is|:)?\s*(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])",
// ISBN with just whitespace
r"ISBN(?:-13)?\s+(\d{13})",
r"ISBN(?:-10)?\s+(\d{9}[\dXx])",
// Bare ISBN-13 with hyphens (in case "ISBN" is missing)
r"\b(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)\b",
// Bare ISBN-10 with hyphens
r"\b(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])\b",
];
for pattern_str in patterns {
if let Ok(pattern) = Regex::new(pattern_str)
&& let Some(captures) = pattern.captures(text)
for pattern in ISBN_PATTERNS.iter() {
if let Some(captures) = pattern.captures(text)
&& let Some(isbn) = captures.get(1)
&& let Ok(normalized) = normalize_isbn(isbn.as_str())
{
return Some(normalized);
}
}
None
}
/// Parse author name into "Last, First" format for sorting
#[must_use]
pub fn parse_author_file_as(name: &str) -> String {
// Simple heuristic: if already contains comma, use as-is
if name.contains(',') {
@ -136,7 +143,7 @@ pub fn parse_author_file_as(name: &str) -> String {
return String::new();
};
let given_names = parts[..parts.len() - 1].join(" ");
format!("{}, {}", surname, given_names)
format!("{surname}, {given_names}")
},
}
}

View file

@ -1,6 +1,6 @@
//! MusicBrainz metadata enrichment for audio files.
//! `MusicBrainz` metadata enrichment for audio files.
use std::time::Duration;
use std::{fmt::Write as _, time::Duration};
use chrono::Utc;
use uuid::Uuid;
@ -23,14 +23,17 @@ impl Default for MusicBrainzEnricher {
}
impl MusicBrainzEnricher {
/// Create a new `MusicBrainzEnricher`.
///
/// Builds an HTTP client with a MusicBrainz-compliant `User-Agent` and
/// request/connect timeouts. If the configured builder fails (e.g. TLS
/// backend initialization error), falls back to a default client so that
/// construction never panics; the fallback merely loses the custom
/// timeouts, not functionality.
#[must_use]
pub fn new() -> Self {
    let client = reqwest::Client::builder()
        .user_agent("Pinakes/0.1 (https://github.com/notashelf/pinakes)")
        .timeout(Duration::from_secs(10))
        .connect_timeout(Duration::from_secs(5))
        .build()
        .unwrap_or_else(|_| reqwest::Client::new());

    Self {
        client,
        base_url: "https://musicbrainz.org/ws/2".to_string(),
    }
}
@ -65,7 +68,7 @@ impl MetadataEnricher for MusicBrainzEnricher {
let mut query = format!("recording:{}", escape_lucene_query(title));
if let Some(ref artist) = item.artist {
query.push_str(&format!(" AND artist:{}", escape_lucene_query(artist)));
let _ = write!(query, " AND artist:{}", escape_lucene_query(artist));
}
let url = format!("{}/recording/", self.base_url);
@ -113,7 +116,7 @@ impl MetadataEnricher for MusicBrainzEnricher {
})?;
let recordings = json.get("recordings").and_then(|r| r.as_array());
if recordings.is_none_or(|r| r.is_empty()) {
if recordings.is_none_or(std::vec::Vec::is_empty) {
return Ok(None);
}
@ -125,11 +128,12 @@ impl MetadataEnricher for MusicBrainzEnricher {
.get("id")
.and_then(|id| id.as_str())
.map(String::from);
let score = recording
let score = (recording
.get("score")
.and_then(|s| s.as_f64())
.and_then(serde_json::Value::as_f64)
.unwrap_or(0.0)
/ 100.0;
/ 100.0)
.min(1.0);
Ok(Some(ExternalMetadata {
id: Uuid::now_v7(),

View file

@ -2,6 +2,16 @@ use std::path::Path;
use serde::{Deserialize, Serialize};
/// Quote a string field for CSV output per RFC 4180.
///
/// A field is wrapped in double-quotes only when it contains a character
/// that requires quoting (comma, double-quote, CR, or LF); internal
/// double-quotes are escaped by doubling them. Fields without special
/// characters are returned unchanged, which keeps the output minimal and
/// avoids needless quoting of plain values.
fn csv_quote(s: &str) -> String {
    if s.contains([',', '"', '\n', '\r']) {
        format!("\"{}\"", s.replace('"', "\"\""))
    } else {
        s.to_string()
    }
}
use crate::{error::Result, jobs::ExportFormat, storage::DynStorageBackend};
#[derive(Debug, Clone, Serialize, Deserialize)]
@ -11,6 +21,11 @@ pub struct ExportResult {
}
/// Export library data to the specified format.
///
/// # Errors
///
/// Returns an error if the storage query fails or if the output file cannot
/// be written.
pub async fn export_library(
storage: &DynStorageBackend,
format: &ExportFormat,
@ -38,27 +53,30 @@ pub async fn export_library(
album,genre,year,duration_secs,description,created_at,updated_at\n",
);
for item in &items {
csv.push_str(&format!(
"{},{},{},{:?},{},{},{},{},{},{},{},{},{},{},{}\n",
use std::fmt::Write as _;
writeln!(
csv,
"{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}",
item.id,
item.path.display(),
item.file_name,
item.media_type,
csv_quote(&item.path.display().to_string()),
csv_quote(&item.file_name),
csv_quote(&format!("{:?}", item.media_type)),
item.content_hash,
item.file_size,
item.title.as_deref().unwrap_or(""),
item.artist.as_deref().unwrap_or(""),
item.album.as_deref().unwrap_or(""),
item.genre.as_deref().unwrap_or(""),
csv_quote(item.title.as_deref().unwrap_or("")),
csv_quote(item.artist.as_deref().unwrap_or("")),
csv_quote(item.album.as_deref().unwrap_or("")),
csv_quote(item.genre.as_deref().unwrap_or("")),
item.year.map(|y| y.to_string()).unwrap_or_default(),
item
.duration_secs
.map(|d| d.to_string())
.unwrap_or_default(),
item.description.as_deref().unwrap_or(""),
csv_quote(item.description.as_deref().unwrap_or("")),
item.created_at,
item.updated_at,
));
)
.unwrap_or_default();
}
std::fs::write(destination, csv)?;
},

View file

@ -36,11 +36,9 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
// Find the Info dictionary via the trailer
if let Ok(info_ref) = doc.trailer.get(b"Info") {
let info_obj = if let Ok(reference) = info_ref.as_reference() {
doc.get_object(reference).ok()
} else {
Some(info_ref)
};
let info_obj = info_ref
.as_reference()
.map_or(Some(info_ref), |reference| doc.get_object(reference).ok());
if let Some(obj) = info_obj
&& let Ok(dict) = obj.as_dict()
@ -50,23 +48,19 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
}
if let Ok(author) = dict.get(b"Author") {
let author_str = pdf_object_to_string(author);
meta.artist = author_str.clone();
meta.artist.clone_from(&author_str);
// Parse multiple authors if separated by semicolon, comma, or "and"
if let Some(authors_str) = author_str {
let author_names: Vec<String> = authors_str
book_meta.authors = authors_str
.split(&[';', ','][..])
.flat_map(|part| part.split(" and "))
.map(|name| name.trim().to_string())
.filter(|name| !name.is_empty())
.collect();
book_meta.authors = author_names
.into_iter()
.enumerate()
.map(|(pos, name)| {
let mut author = crate::model::AuthorInfo::new(name);
author.position = pos as i32;
author.position = i32::try_from(pos).unwrap_or(i32::MAX);
author
})
.collect();
@ -94,7 +88,7 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
let pages = doc.get_pages();
let page_count = pages.len();
if page_count > 0 {
book_meta.page_count = Some(page_count as i32);
book_meta.page_count = Some(i32::try_from(page_count).unwrap_or(i32::MAX));
}
// Try to extract ISBN from first few pages
@ -187,7 +181,7 @@ fn extract_epub(path: &Path) -> Result<ExtractedMetadata> {
// Check for role in refinements
if let Some(role_ref) = item.refinement("role") {
author.role = role_ref.value.clone();
author.role.clone_from(&role_ref.value);
}
authors.push(author);
@ -205,16 +199,18 @@ fn extract_epub(path: &Path) -> Result<ExtractedMetadata> {
.map(|r| r.value.to_lowercase());
let id_type = match scheme.as_deref() {
Some("isbn") => "isbn",
Some("isbn-10") | Some("isbn10") => "isbn",
Some("isbn-13") | Some("isbn13") => "isbn13",
Some("isbn" | "isbn-10" | "isbn10") => "isbn",
Some("isbn-13" | "isbn13") => "isbn13",
Some("asin") => "asin",
Some("doi") => "doi",
_ => {
// Fallback: detect from value pattern
if item.value.len() == 10
|| item.value.len() == 13
|| item.value.contains('-') && item.value.len() < 20
// Fallback: detect from value pattern.
// ISBN-10 = 10 chars bare, ISBN-13 = 13 chars bare,
// hyphenated ISBN-13 = 17 chars (e.g. 978-0-123-45678-9).
// Parentheses required: && binds tighter than ||.
if (item.value.len() == 10 || item.value.len() == 13)
|| (item.value.contains('-')
&& (item.value.len() == 13 || item.value.len() == 17))
{
"isbn"
} else {
@ -286,6 +282,15 @@ fn extract_djvu(path: &Path) -> Result<ExtractedMetadata> {
// DjVu files contain metadata in SEXPR (S-expression) format within
// ANTa/ANTz chunks, or in the DIRM chunk. We parse the raw bytes to
// extract any metadata fields we can find.
// Guard against loading very large DjVu files into memory.
const MAX_DJVU_SIZE: u64 = 50 * 1024 * 1024; // 50 MB
let file_meta = std::fs::metadata(path)
.map_err(|e| PinakesError::MetadataExtraction(format!("DjVu stat: {e}")))?;
if file_meta.len() > MAX_DJVU_SIZE {
return Ok(ExtractedMetadata::default());
}
let data = std::fs::read(path)
.map_err(|e| PinakesError::MetadataExtraction(format!("DjVu read: {e}")))?;

View file

@ -11,7 +11,8 @@ use crate::media_type::MediaType;
pub struct MediaId(pub Uuid);
impl MediaId {
/// Creates a new media ID using UUIDv7.
/// Creates a new media ID using `UUIDv7`.
#[must_use]
pub fn new() -> Self {
Self(Uuid::now_v7())
}
@ -25,7 +26,7 @@ impl fmt::Display for MediaId {
impl Default for MediaId {
fn default() -> Self {
Self::new()
Self(uuid::Uuid::nil())
}
}
@ -35,7 +36,8 @@ pub struct ContentHash(pub String);
impl ContentHash {
/// Creates a new content hash from a hex string.
pub fn new(hex: String) -> Self {
#[must_use]
pub const fn new(hex: String) -> Self {
Self(hex)
}
}
@ -75,7 +77,7 @@ impl std::str::FromStr for StorageMode {
match s.to_lowercase().as_str() {
"external" => Ok(Self::External),
"managed" => Ok(Self::Managed),
_ => Err(format!("unknown storage mode: {}", s)),
_ => Err(format!("unknown storage mode: {s}")),
}
}
}
@ -147,11 +149,11 @@ pub struct MediaItem {
#[serde(default)]
pub storage_mode: StorageMode,
/// Original filename for uploaded files (preserved separately from
/// file_name)
/// `file_name`)
pub original_filename: Option<String>,
/// When the file was uploaded to managed storage
pub uploaded_at: Option<DateTime<Utc>>,
/// Storage key for looking up the blob (usually same as content_hash)
/// Storage key for looking up the blob (usually same as `content_hash`)
pub storage_key: Option<String>,
pub created_at: DateTime<Utc>,
@ -182,7 +184,8 @@ pub enum CustomFieldType {
}
impl CustomFieldType {
pub fn as_str(&self) -> &'static str {
#[must_use]
pub const fn as_str(&self) -> &'static str {
match self {
Self::Text => "text",
Self::Number => "number",
@ -228,7 +231,8 @@ pub enum CollectionKind {
}
impl CollectionKind {
pub fn as_str(&self) -> &'static str {
#[must_use]
pub const fn as_str(&self) -> &'static str {
match self {
Self::Manual => "manual",
Self::Virtual => "virtual",
@ -380,7 +384,8 @@ pub struct Pagination {
impl Pagination {
/// Creates a new pagination instance.
pub fn new(offset: u64, limit: u64, sort: Option<String>) -> Self {
#[must_use]
pub const fn new(offset: u64, limit: u64, sort: Option<String>) -> Self {
Self {
offset,
limit,
@ -431,7 +436,7 @@ pub struct BookMetadata {
}
/// Information about a book author.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct AuthorInfo {
pub name: String,
pub role: String,
@ -441,6 +446,7 @@ pub struct AuthorInfo {
impl AuthorInfo {
/// Creates a new author with the given name.
#[must_use]
pub fn new(name: String) -> Self {
Self {
name,
@ -451,17 +457,20 @@ impl AuthorInfo {
}
/// Sets the author's role.
#[must_use]
pub fn with_role(mut self, role: String) -> Self {
self.role = role;
self
}
#[must_use]
pub fn with_file_as(mut self, file_as: String) -> Self {
self.file_as = Some(file_as);
self
}
pub fn with_position(mut self, position: i32) -> Self {
#[must_use]
pub const fn with_position(mut self, position: i32) -> Self {
self.position = position;
self
}
@ -496,21 +505,20 @@ pub struct ReadingProgress {
impl ReadingProgress {
/// Creates a new reading progress entry.
#[must_use]
pub fn new(
media_id: MediaId,
user_id: Uuid,
current_page: i32,
total_pages: Option<i32>,
) -> Self {
let progress_percent = if let Some(total) = total_pages {
let progress_percent = total_pages.map_or(0.0, |total| {
if total > 0 {
(current_page as f64 / total as f64 * 100.0).min(100.0)
(f64::from(current_page) / f64::from(total) * 100.0).min(100.0)
} else {
0.0
}
} else {
0.0
};
});
Self {
media_id,
@ -574,7 +582,7 @@ impl std::str::FromStr for LinkType {
"wikilink" => Ok(Self::Wikilink),
"markdown_link" => Ok(Self::MarkdownLink),
"embed" => Ok(Self::Embed),
_ => Err(format!("unknown link type: {}", s)),
_ => Err(format!("unknown link type: {s}")),
}
}
}
@ -586,7 +594,7 @@ pub struct MarkdownLink {
pub source_media_id: MediaId,
/// Raw link target as written in the source (wikilink name or path)
pub target_path: String,
/// Resolved target media_id (None if unresolved)
/// Resolved target `media_id` (None if unresolved)
pub target_media_id: Option<MediaId>,
pub link_type: LinkType,
/// Display text for the link