From d5be5026a7e0d425e2c1550d07d1b1bdeb0bdeff Mon Sep 17 00:00:00 2001 From: NotAShelf Date: Sun, 8 Mar 2026 00:42:01 +0300 Subject: [PATCH] pinakes-core: fix isbn regex, csv quoting, document extraction, and enrichment accuracy Signed-off-by: NotAShelf Change-Id: I974959e74d2b5b5591437daa0f29291a6a6a6964 --- crates/pinakes-core/src/books.rs | 61 +++++++++++-------- .../src/enrichment/musicbrainz.rs | 30 +++++---- crates/pinakes-core/src/export.rs | 40 ++++++++---- crates/pinakes-core/src/metadata/document.rs | 47 +++++++------- crates/pinakes-core/src/model.rs | 44 +++++++------ 5 files changed, 132 insertions(+), 90 deletions(-) diff --git a/crates/pinakes-core/src/books.rs b/crates/pinakes-core/src/books.rs index 92eb174..7a4a554 100644 --- a/crates/pinakes-core/src/books.rs +++ b/crates/pinakes-core/src/books.rs @@ -1,6 +1,32 @@ +use std::sync::LazyLock; + use crate::error::{PinakesError, Result}; +/// Compiled ISBN extraction patterns. Compiled once, reused on every call. +static ISBN_PATTERNS: LazyLock> = LazyLock::new(|| { + [ + // ISBN followed by colon or "is" with hyphens (most specific) + r"ISBN(?:-13)?(?:\s+is|:)?\s*(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)", + r"ISBN(?:-10)?(?:\s+is|:)?\s*(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])", + // ISBN with just whitespace + r"ISBN(?:-13)?\s+(\d{13})", + r"ISBN(?:-10)?\s+(\d{9}[\dXx])", + // Bare ISBN-13 with hyphens (in case "ISBN" is missing) + r"\b(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)\b", + // Bare ISBN-10 with hyphens + r"\b(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])\b", + ] + .iter() + .filter_map(|p| regex::Regex::new(p).ok()) + .collect() +}); + /// Normalize ISBN to ISBN-13 format +/// +/// # Errors +/// +/// Returns [`PinakesError`] if the ISBN is invalid or has an incorrect +/// checksum. pub fn normalize_isbn(isbn: &str) -> Result { // Remove hyphens, spaces, and any non-numeric characters (except X for // ISBN-10) @@ -16,15 +42,13 @@ pub fn normalize_isbn(isbn: &str) -> Result { Ok(clean) } else { Err(PinakesError::InvalidData(format!( - "Invalid ISBN-13 checksum: {}", - isbn + "Invalid ISBN-13 checksum: {isbn}" ))) } }, _ => { Err(PinakesError::InvalidData(format!( - "Invalid ISBN length: {}", - isbn + "Invalid ISBN length: {isbn}" ))) }, } @@ -34,8 +58,7 @@ pub fn normalize_isbn(isbn: &str) -> Result { fn isbn10_to_isbn13(isbn10: &str) -> Result { if isbn10.len() != 10 { return Err(PinakesError::InvalidData(format!( - "ISBN-10 must be 10 characters: {}", - isbn10 + "ISBN-10 must be 10 characters: {isbn10}" ))); } @@ -87,37 +110,21 @@ fn is_valid_isbn13(isbn13: &str) -> bool { } /// Extract ISBN from text (searches for ISBN-10 or ISBN-13 patterns) +#[must_use] pub fn extract_isbn_from_text(text: &str) -> Option { - use regex::Regex; - - // Try different patterns in order of specificity - let patterns = vec![ - // ISBN followed by colon or "is" with hyphens (most specific) - r"ISBN(?:-13)?(?:\s+is|:)?\s*(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)", - r"ISBN(?:-10)?(?:\s+is|:)?\s*(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])", - // ISBN with just whitespace - r"ISBN(?:-13)?\s+(\d{13})", - r"ISBN(?:-10)?\s+(\d{9}[\dXx])", - // Bare ISBN-13 with hyphens (in case "ISBN" is missing) - r"\b(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)\b", - // Bare ISBN-10 with hyphens - r"\b(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])\b", - ]; - - for pattern_str in patterns { - if let Ok(pattern) = Regex::new(pattern_str) - && let Some(captures) = pattern.captures(text) + for pattern in ISBN_PATTERNS.iter() { + if let Some(captures) = pattern.captures(text) && let Some(isbn) = captures.get(1) && let Ok(normalized) = normalize_isbn(isbn.as_str()) { return Some(normalized); } } - None } /// Parse author name into "Last, First" format for sorting +#[must_use] pub fn parse_author_file_as(name: &str) -> String { // Simple heuristic: if already contains comma, use as-is if name.contains(',') { @@ -136,7 +143,7 @@ pub fn parse_author_file_as(name: &str) -> String { return String::new(); }; let given_names = parts[..parts.len() - 1].join(" "); - format!("{}, {}", surname, given_names) + format!("{surname}, {given_names}") }, } } diff --git a/crates/pinakes-core/src/enrichment/musicbrainz.rs b/crates/pinakes-core/src/enrichment/musicbrainz.rs index 1b315a6..9115f38 100644 --- a/crates/pinakes-core/src/enrichment/musicbrainz.rs +++ b/crates/pinakes-core/src/enrichment/musicbrainz.rs @@ -1,6 +1,6 @@ -//! MusicBrainz metadata enrichment for audio files. +//! `MusicBrainz` metadata enrichment for audio files. -use std::time::Duration; +use std::{fmt::Write as _, time::Duration}; use chrono::Utc; use uuid::Uuid; @@ -23,14 +23,17 @@ impl Default for MusicBrainzEnricher { } impl MusicBrainzEnricher { + /// Create a new `MusicBrainzEnricher`. + #[must_use] pub fn new() -> Self { + let client = reqwest::Client::builder() + .user_agent("Pinakes/0.1 (https://github.com/notashelf/pinakes)") + .timeout(Duration::from_secs(10)) + .connect_timeout(Duration::from_secs(5)) + .build() + .unwrap_or_else(|_| reqwest::Client::new()); Self { - client: reqwest::Client::builder() - .user_agent("Pinakes/0.1 (https://github.com/notashelf/pinakes)") - .timeout(Duration::from_secs(10)) - .connect_timeout(Duration::from_secs(5)) - .build() - .expect("failed to build HTTP client with configured timeouts"), + client, base_url: "https://musicbrainz.org/ws/2".to_string(), } } @@ -65,7 +68,7 @@ impl MetadataEnricher for MusicBrainzEnricher { let mut query = format!("recording:{}", escape_lucene_query(title)); if let Some(ref artist) = item.artist { - query.push_str(&format!(" AND artist:{}", escape_lucene_query(artist))); + let _ = write!(query, " AND artist:{}", escape_lucene_query(artist)); } let url = format!("{}/recording/", self.base_url); @@ -113,7 +116,7 @@ impl MetadataEnricher for MusicBrainzEnricher { })?; let recordings = json.get("recordings").and_then(|r| r.as_array()); - if recordings.is_none_or(|r| r.is_empty()) { + if recordings.is_none_or(std::vec::Vec::is_empty) { return Ok(None); } @@ -125,11 +128,12 @@ impl MetadataEnricher for MusicBrainzEnricher { .get("id") .and_then(|id| id.as_str()) .map(String::from); - let score = recording + let score = (recording .get("score") - .and_then(|s| s.as_f64()) + .and_then(serde_json::Value::as_f64) .unwrap_or(0.0) - / 100.0; + / 100.0) + .min(1.0); Ok(Some(ExternalMetadata { id: Uuid::now_v7(), diff --git a/crates/pinakes-core/src/export.rs b/crates/pinakes-core/src/export.rs index a68bf63..50542b9 100644 --- a/crates/pinakes-core/src/export.rs +++ b/crates/pinakes-core/src/export.rs @@ -2,6 +2,16 @@ use std::path::Path; use serde::{Deserialize, Serialize}; +/// Quote a string field for CSV output: wrap in double-quotes and escape +/// internal double-quotes by doubling them (RFC 4180). +fn csv_quote(s: &str) -> String { + if s.contains([',', '"', '\n', '\r']) { + format!("\"{}\"", s.replace('"', "\"\"")) + } else { + s.to_string() + } +} + use crate::{error::Result, jobs::ExportFormat, storage::DynStorageBackend}; #[derive(Debug, Clone, Serialize, Deserialize)] @@ -11,6 +21,11 @@ pub struct ExportResult { } /// Export library data to the specified format. +/// +/// # Errors +/// +/// Returns an error if the storage query fails or if the output file cannot +/// be written. pub async fn export_library( storage: &DynStorageBackend, format: &ExportFormat, @@ -38,27 +53,30 @@ pub async fn export_library( album,genre,year,duration_secs,description,created_at,updated_at\n", ); for item in &items { - csv.push_str(&format!( - "{},{},{},{:?},{},{},{},{},{},{},{},{},{},{},{}\n", + use std::fmt::Write as _; + writeln!( + csv, + "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}", item.id, - item.path.display(), - item.file_name, - item.media_type, + csv_quote(&item.path.display().to_string()), + csv_quote(&item.file_name), + csv_quote(&format!("{:?}", item.media_type)), item.content_hash, item.file_size, - item.title.as_deref().unwrap_or(""), - item.artist.as_deref().unwrap_or(""), - item.album.as_deref().unwrap_or(""), - item.genre.as_deref().unwrap_or(""), + csv_quote(item.title.as_deref().unwrap_or("")), + csv_quote(item.artist.as_deref().unwrap_or("")), + csv_quote(item.album.as_deref().unwrap_or("")), + csv_quote(item.genre.as_deref().unwrap_or("")), item.year.map(|y| y.to_string()).unwrap_or_default(), item .duration_secs .map(|d| d.to_string()) .unwrap_or_default(), - item.description.as_deref().unwrap_or(""), + csv_quote(item.description.as_deref().unwrap_or("")), item.created_at, item.updated_at, - )); + ) + .unwrap_or_default(); } std::fs::write(destination, csv)?; }, diff --git a/crates/pinakes-core/src/metadata/document.rs b/crates/pinakes-core/src/metadata/document.rs index b548177..f284c51 100644 --- a/crates/pinakes-core/src/metadata/document.rs +++ b/crates/pinakes-core/src/metadata/document.rs @@ -36,11 +36,9 @@ fn extract_pdf(path: &Path) -> Result { // Find the Info dictionary via the trailer if let Ok(info_ref) = doc.trailer.get(b"Info") { - let info_obj = if let Ok(reference) = info_ref.as_reference() { - doc.get_object(reference).ok() - } else { - Some(info_ref) - }; + let info_obj = info_ref + .as_reference() + .map_or(Some(info_ref), |reference| doc.get_object(reference).ok()); if let Some(obj) = info_obj && let Ok(dict) = obj.as_dict() @@ -50,23 +48,19 @@ fn extract_pdf(path: &Path) -> Result { } if let Ok(author) = dict.get(b"Author") { let author_str = pdf_object_to_string(author); - meta.artist = author_str.clone(); + meta.artist.clone_from(&author_str); // Parse multiple authors if separated by semicolon, comma, or "and" if let Some(authors_str) = author_str { - let author_names: Vec = authors_str + book_meta.authors = authors_str .split(&[';', ','][..]) .flat_map(|part| part.split(" and ")) .map(|name| name.trim().to_string()) .filter(|name| !name.is_empty()) - .collect(); - - book_meta.authors = author_names - .into_iter() .enumerate() .map(|(pos, name)| { let mut author = crate::model::AuthorInfo::new(name); - author.position = pos as i32; + author.position = i32::try_from(pos).unwrap_or(i32::MAX); author }) .collect(); @@ -94,7 +88,7 @@ fn extract_pdf(path: &Path) -> Result { let pages = doc.get_pages(); let page_count = pages.len(); if page_count > 0 { - book_meta.page_count = Some(page_count as i32); + book_meta.page_count = Some(i32::try_from(page_count).unwrap_or(i32::MAX)); } // Try to extract ISBN from first few pages @@ -187,7 +181,7 @@ fn extract_epub(path: &Path) -> Result { // Check for role in refinements if let Some(role_ref) = item.refinement("role") { - author.role = role_ref.value.clone(); + author.role.clone_from(&role_ref.value); } authors.push(author); @@ -205,16 +199,18 @@ fn extract_epub(path: &Path) -> Result { .map(|r| r.value.to_lowercase()); let id_type = match scheme.as_deref() { - Some("isbn") => "isbn", - Some("isbn-10") | Some("isbn10") => "isbn", - Some("isbn-13") | Some("isbn13") => "isbn13", + Some("isbn" | "isbn-10" | "isbn10") => "isbn", + Some("isbn-13" | "isbn13") => "isbn13", Some("asin") => "asin", Some("doi") => "doi", _ => { - // Fallback: detect from value pattern - if item.value.len() == 10 - || item.value.len() == 13 - || item.value.contains('-') && item.value.len() < 20 + // Fallback: detect from value pattern. + // ISBN-10 = 10 chars bare, ISBN-13 = 13 chars bare, + // hyphenated ISBN-13 = 17 chars (e.g. 978-0-123-45678-9). + // Parentheses required: && binds tighter than ||. + if (item.value.len() == 10 || item.value.len() == 13) + || (item.value.contains('-') + && (item.value.len() == 13 || item.value.len() == 17)) { "isbn" } else { @@ -286,6 +282,15 @@ fn extract_djvu(path: &Path) -> Result { // DjVu files contain metadata in SEXPR (S-expression) format within // ANTa/ANTz chunks, or in the DIRM chunk. We parse the raw bytes to // extract any metadata fields we can find. + + // Guard against loading very large DjVu files into memory. + const MAX_DJVU_SIZE: u64 = 50 * 1024 * 1024; // 50 MB + let file_meta = std::fs::metadata(path) + .map_err(|e| PinakesError::MetadataExtraction(format!("DjVu stat: {e}")))?; + if file_meta.len() > MAX_DJVU_SIZE { + return Ok(ExtractedMetadata::default()); + } + let data = std::fs::read(path) .map_err(|e| PinakesError::MetadataExtraction(format!("DjVu read: {e}")))?; diff --git a/crates/pinakes-core/src/model.rs b/crates/pinakes-core/src/model.rs index 6838c6f..cedf0ef 100644 --- a/crates/pinakes-core/src/model.rs +++ b/crates/pinakes-core/src/model.rs @@ -11,7 +11,8 @@ use crate::media_type::MediaType; pub struct MediaId(pub Uuid); impl MediaId { - /// Creates a new media ID using UUIDv7. + /// Creates a new media ID using `UUIDv7`. + #[must_use] pub fn new() -> Self { Self(Uuid::now_v7()) } @@ -25,7 +26,7 @@ impl fmt::Display for MediaId { impl Default for MediaId { fn default() -> Self { - Self::new() + Self(uuid::Uuid::nil()) } } @@ -35,7 +36,8 @@ pub struct ContentHash(pub String); impl ContentHash { /// Creates a new content hash from a hex string. - pub fn new(hex: String) -> Self { + #[must_use] + pub const fn new(hex: String) -> Self { Self(hex) } } @@ -75,7 +77,7 @@ impl std::str::FromStr for StorageMode { match s.to_lowercase().as_str() { "external" => Ok(Self::External), "managed" => Ok(Self::Managed), - _ => Err(format!("unknown storage mode: {}", s)), + _ => Err(format!("unknown storage mode: {s}")), } } } @@ -147,11 +149,11 @@ pub struct MediaItem { #[serde(default)] pub storage_mode: StorageMode, /// Original filename for uploaded files (preserved separately from - /// file_name) + /// `file_name`) pub original_filename: Option, /// When the file was uploaded to managed storage pub uploaded_at: Option>, - /// Storage key for looking up the blob (usually same as content_hash) + /// Storage key for looking up the blob (usually same as `content_hash`) pub storage_key: Option, pub created_at: DateTime, @@ -182,7 +184,8 @@ pub enum CustomFieldType { } impl CustomFieldType { - pub fn as_str(&self) -> &'static str { + #[must_use] + pub const fn as_str(&self) -> &'static str { match self { Self::Text => "text", Self::Number => "number", @@ -228,7 +231,8 @@ pub enum CollectionKind { } impl CollectionKind { - pub fn as_str(&self) -> &'static str { + #[must_use] + pub const fn as_str(&self) -> &'static str { match self { Self::Manual => "manual", Self::Virtual => "virtual", @@ -380,7 +384,8 @@ pub struct Pagination { impl Pagination { /// Creates a new pagination instance. - pub fn new(offset: u64, limit: u64, sort: Option) -> Self { + #[must_use] + pub const fn new(offset: u64, limit: u64, sort: Option) -> Self { Self { offset, limit, @@ -431,7 +436,7 @@ pub struct BookMetadata { } /// Information about a book author. -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct AuthorInfo { pub name: String, pub role: String, @@ -441,6 +446,7 @@ pub struct AuthorInfo { impl AuthorInfo { /// Creates a new author with the given name. + #[must_use] pub fn new(name: String) -> Self { Self { name, @@ -451,17 +457,20 @@ impl AuthorInfo { } /// Sets the author's role. + #[must_use] pub fn with_role(mut self, role: String) -> Self { self.role = role; self } + #[must_use] pub fn with_file_as(mut self, file_as: String) -> Self { self.file_as = Some(file_as); self } - pub fn with_position(mut self, position: i32) -> Self { + #[must_use] + pub const fn with_position(mut self, position: i32) -> Self { self.position = position; self } @@ -496,21 +505,20 @@ pub struct ReadingProgress { impl ReadingProgress { /// Creates a new reading progress entry. + #[must_use] pub fn new( media_id: MediaId, user_id: Uuid, current_page: i32, total_pages: Option, ) -> Self { - let progress_percent = if let Some(total) = total_pages { + let progress_percent = total_pages.map_or(0.0, |total| { if total > 0 { - (current_page as f64 / total as f64 * 100.0).min(100.0) + (f64::from(current_page) / f64::from(total) * 100.0).min(100.0) } else { 0.0 } - } else { - 0.0 - }; + }); Self { media_id, @@ -574,7 +582,7 @@ impl std::str::FromStr for LinkType { "wikilink" => Ok(Self::Wikilink), "markdown_link" => Ok(Self::MarkdownLink), "embed" => Ok(Self::Embed), - _ => Err(format!("unknown link type: {}", s)), + _ => Err(format!("unknown link type: {s}")), } } } @@ -586,7 +594,7 @@ pub struct MarkdownLink { pub source_media_id: MediaId, /// Raw link target as written in the source (wikilink name or path) pub target_path: String, - /// Resolved target media_id (None if unresolved) + /// Resolved target `media_id` (None if unresolved) pub target_media_id: Option, pub link_type: LinkType, /// Display text for the link