pinakes-core: fix isbn regex, csv quoting, document extraction, and enrichment accuracy

Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: I974959e74d2b5b5591437daa0f29291a6a6a6964
This commit is contained in:
raf 2026-03-08 00:42:01 +03:00
commit d5be5026a7
Signed by: NotAShelf
GPG key ID: 29D95B64378DB4BF
5 changed files with 132 additions and 90 deletions

View file

@ -1,6 +1,32 @@
use std::sync::LazyLock;
use crate::error::{PinakesError, Result}; use crate::error::{PinakesError, Result};
/// Compiled ISBN extraction patterns. Compiled once, reused on every call.
static ISBN_PATTERNS: LazyLock<Vec<regex::Regex>> = LazyLock::new(|| {
[
// ISBN followed by colon or "is" with hyphens (most specific)
r"ISBN(?:-13)?(?:\s+is|:)?\s*(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)",
r"ISBN(?:-10)?(?:\s+is|:)?\s*(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])",
// ISBN with just whitespace
r"ISBN(?:-13)?\s+(\d{13})",
r"ISBN(?:-10)?\s+(\d{9}[\dXx])",
// Bare ISBN-13 with hyphens (in case "ISBN" is missing)
r"\b(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)\b",
// Bare ISBN-10 with hyphens
r"\b(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])\b",
]
.iter()
.filter_map(|p| regex::Regex::new(p).ok())
.collect()
});
/// Normalize ISBN to ISBN-13 format /// Normalize ISBN to ISBN-13 format
///
/// # Errors
///
/// Returns [`PinakesError`] if the ISBN is invalid or has an incorrect
/// checksum.
pub fn normalize_isbn(isbn: &str) -> Result<String> { pub fn normalize_isbn(isbn: &str) -> Result<String> {
// Remove hyphens, spaces, and any non-numeric characters (except X for // Remove hyphens, spaces, and any non-numeric characters (except X for
// ISBN-10) // ISBN-10)
@ -16,15 +42,13 @@ pub fn normalize_isbn(isbn: &str) -> Result<String> {
Ok(clean) Ok(clean)
} else { } else {
Err(PinakesError::InvalidData(format!( Err(PinakesError::InvalidData(format!(
"Invalid ISBN-13 checksum: {}", "Invalid ISBN-13 checksum: {isbn}"
isbn
))) )))
} }
}, },
_ => { _ => {
Err(PinakesError::InvalidData(format!( Err(PinakesError::InvalidData(format!(
"Invalid ISBN length: {}", "Invalid ISBN length: {isbn}"
isbn
))) )))
}, },
} }
@ -34,8 +58,7 @@ pub fn normalize_isbn(isbn: &str) -> Result<String> {
fn isbn10_to_isbn13(isbn10: &str) -> Result<String> { fn isbn10_to_isbn13(isbn10: &str) -> Result<String> {
if isbn10.len() != 10 { if isbn10.len() != 10 {
return Err(PinakesError::InvalidData(format!( return Err(PinakesError::InvalidData(format!(
"ISBN-10 must be 10 characters: {}", "ISBN-10 must be 10 characters: {isbn10}"
isbn10
))); )));
} }
@ -87,37 +110,21 @@ fn is_valid_isbn13(isbn13: &str) -> bool {
} }
/// Extract ISBN from text (searches for ISBN-10 or ISBN-13 patterns) /// Extract ISBN from text (searches for ISBN-10 or ISBN-13 patterns)
#[must_use]
pub fn extract_isbn_from_text(text: &str) -> Option<String> { pub fn extract_isbn_from_text(text: &str) -> Option<String> {
use regex::Regex; for pattern in ISBN_PATTERNS.iter() {
if let Some(captures) = pattern.captures(text)
// Try different patterns in order of specificity
let patterns = vec![
// ISBN followed by colon or "is" with hyphens (most specific)
r"ISBN(?:-13)?(?:\s+is|:)?\s*(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)",
r"ISBN(?:-10)?(?:\s+is|:)?\s*(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])",
// ISBN with just whitespace
r"ISBN(?:-13)?\s+(\d{13})",
r"ISBN(?:-10)?\s+(\d{9}[\dXx])",
// Bare ISBN-13 with hyphens (in case "ISBN" is missing)
r"\b(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)\b",
// Bare ISBN-10 with hyphens
r"\b(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])\b",
];
for pattern_str in patterns {
if let Ok(pattern) = Regex::new(pattern_str)
&& let Some(captures) = pattern.captures(text)
&& let Some(isbn) = captures.get(1) && let Some(isbn) = captures.get(1)
&& let Ok(normalized) = normalize_isbn(isbn.as_str()) && let Ok(normalized) = normalize_isbn(isbn.as_str())
{ {
return Some(normalized); return Some(normalized);
} }
} }
None None
} }
/// Parse author name into "Last, First" format for sorting /// Parse author name into "Last, First" format for sorting
#[must_use]
pub fn parse_author_file_as(name: &str) -> String { pub fn parse_author_file_as(name: &str) -> String {
// Simple heuristic: if already contains comma, use as-is // Simple heuristic: if already contains comma, use as-is
if name.contains(',') { if name.contains(',') {
@ -136,7 +143,7 @@ pub fn parse_author_file_as(name: &str) -> String {
return String::new(); return String::new();
}; };
let given_names = parts[..parts.len() - 1].join(" "); let given_names = parts[..parts.len() - 1].join(" ");
format!("{}, {}", surname, given_names) format!("{surname}, {given_names}")
}, },
} }
} }

View file

@ -1,6 +1,6 @@
//! MusicBrainz metadata enrichment for audio files. //! `MusicBrainz` metadata enrichment for audio files.
use std::time::Duration; use std::{fmt::Write as _, time::Duration};
use chrono::Utc; use chrono::Utc;
use uuid::Uuid; use uuid::Uuid;
@ -23,14 +23,17 @@ impl Default for MusicBrainzEnricher {
} }
impl MusicBrainzEnricher { impl MusicBrainzEnricher {
/// Create a new `MusicBrainzEnricher`.
#[must_use]
pub fn new() -> Self { pub fn new() -> Self {
let client = reqwest::Client::builder()
.user_agent("Pinakes/0.1 (https://github.com/notashelf/pinakes)")
.timeout(Duration::from_secs(10))
.connect_timeout(Duration::from_secs(5))
.build()
.unwrap_or_else(|_| reqwest::Client::new());
Self { Self {
client: reqwest::Client::builder() client,
.user_agent("Pinakes/0.1 (https://github.com/notashelf/pinakes)")
.timeout(Duration::from_secs(10))
.connect_timeout(Duration::from_secs(5))
.build()
.expect("failed to build HTTP client with configured timeouts"),
base_url: "https://musicbrainz.org/ws/2".to_string(), base_url: "https://musicbrainz.org/ws/2".to_string(),
} }
} }
@ -65,7 +68,7 @@ impl MetadataEnricher for MusicBrainzEnricher {
let mut query = format!("recording:{}", escape_lucene_query(title)); let mut query = format!("recording:{}", escape_lucene_query(title));
if let Some(ref artist) = item.artist { if let Some(ref artist) = item.artist {
query.push_str(&format!(" AND artist:{}", escape_lucene_query(artist))); let _ = write!(query, " AND artist:{}", escape_lucene_query(artist));
} }
let url = format!("{}/recording/", self.base_url); let url = format!("{}/recording/", self.base_url);
@ -113,7 +116,7 @@ impl MetadataEnricher for MusicBrainzEnricher {
})?; })?;
let recordings = json.get("recordings").and_then(|r| r.as_array()); let recordings = json.get("recordings").and_then(|r| r.as_array());
if recordings.is_none_or(|r| r.is_empty()) { if recordings.is_none_or(std::vec::Vec::is_empty) {
return Ok(None); return Ok(None);
} }
@ -125,11 +128,12 @@ impl MetadataEnricher for MusicBrainzEnricher {
.get("id") .get("id")
.and_then(|id| id.as_str()) .and_then(|id| id.as_str())
.map(String::from); .map(String::from);
let score = recording let score = (recording
.get("score") .get("score")
.and_then(|s| s.as_f64()) .and_then(serde_json::Value::as_f64)
.unwrap_or(0.0) .unwrap_or(0.0)
/ 100.0; / 100.0)
.min(1.0);
Ok(Some(ExternalMetadata { Ok(Some(ExternalMetadata {
id: Uuid::now_v7(), id: Uuid::now_v7(),

View file

@ -2,6 +2,16 @@ use std::path::Path;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
/// Quote a string field for CSV output if needed: when the field contains a
/// comma, double-quote, CR, or LF, wrap it in double-quotes and escape
/// internal double-quotes by doubling them (RFC 4180); otherwise return it
/// unchanged.
fn csv_quote(s: &str) -> String {
if s.contains([',', '"', '\n', '\r']) {
format!("\"{}\"", s.replace('"', "\"\""))
} else {
s.to_string()
}
}
use crate::{error::Result, jobs::ExportFormat, storage::DynStorageBackend}; use crate::{error::Result, jobs::ExportFormat, storage::DynStorageBackend};
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
@ -11,6 +21,11 @@ pub struct ExportResult {
} }
/// Export library data to the specified format. /// Export library data to the specified format.
///
/// # Errors
///
/// Returns an error if the storage query fails or if the output file cannot
/// be written.
pub async fn export_library( pub async fn export_library(
storage: &DynStorageBackend, storage: &DynStorageBackend,
format: &ExportFormat, format: &ExportFormat,
@ -38,27 +53,30 @@ pub async fn export_library(
album,genre,year,duration_secs,description,created_at,updated_at\n", album,genre,year,duration_secs,description,created_at,updated_at\n",
); );
for item in &items { for item in &items {
csv.push_str(&format!( use std::fmt::Write as _;
"{},{},{},{:?},{},{},{},{},{},{},{},{},{},{},{}\n", writeln!(
csv,
"{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}",
item.id, item.id,
item.path.display(), csv_quote(&item.path.display().to_string()),
item.file_name, csv_quote(&item.file_name),
item.media_type, csv_quote(&format!("{:?}", item.media_type)),
item.content_hash, item.content_hash,
item.file_size, item.file_size,
item.title.as_deref().unwrap_or(""), csv_quote(item.title.as_deref().unwrap_or("")),
item.artist.as_deref().unwrap_or(""), csv_quote(item.artist.as_deref().unwrap_or("")),
item.album.as_deref().unwrap_or(""), csv_quote(item.album.as_deref().unwrap_or("")),
item.genre.as_deref().unwrap_or(""), csv_quote(item.genre.as_deref().unwrap_or("")),
item.year.map(|y| y.to_string()).unwrap_or_default(), item.year.map(|y| y.to_string()).unwrap_or_default(),
item item
.duration_secs .duration_secs
.map(|d| d.to_string()) .map(|d| d.to_string())
.unwrap_or_default(), .unwrap_or_default(),
item.description.as_deref().unwrap_or(""), csv_quote(item.description.as_deref().unwrap_or("")),
item.created_at, item.created_at,
item.updated_at, item.updated_at,
)); )
.unwrap_or_default();
} }
std::fs::write(destination, csv)?; std::fs::write(destination, csv)?;
}, },

View file

@ -36,11 +36,9 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
// Find the Info dictionary via the trailer // Find the Info dictionary via the trailer
if let Ok(info_ref) = doc.trailer.get(b"Info") { if let Ok(info_ref) = doc.trailer.get(b"Info") {
let info_obj = if let Ok(reference) = info_ref.as_reference() { let info_obj = info_ref
doc.get_object(reference).ok() .as_reference()
} else { .map_or(Some(info_ref), |reference| doc.get_object(reference).ok());
Some(info_ref)
};
if let Some(obj) = info_obj if let Some(obj) = info_obj
&& let Ok(dict) = obj.as_dict() && let Ok(dict) = obj.as_dict()
@ -50,23 +48,19 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
} }
if let Ok(author) = dict.get(b"Author") { if let Ok(author) = dict.get(b"Author") {
let author_str = pdf_object_to_string(author); let author_str = pdf_object_to_string(author);
meta.artist = author_str.clone(); meta.artist.clone_from(&author_str);
// Parse multiple authors if separated by semicolon, comma, or "and" // Parse multiple authors if separated by semicolon, comma, or "and"
if let Some(authors_str) = author_str { if let Some(authors_str) = author_str {
let author_names: Vec<String> = authors_str book_meta.authors = authors_str
.split(&[';', ','][..]) .split(&[';', ','][..])
.flat_map(|part| part.split(" and ")) .flat_map(|part| part.split(" and "))
.map(|name| name.trim().to_string()) .map(|name| name.trim().to_string())
.filter(|name| !name.is_empty()) .filter(|name| !name.is_empty())
.collect();
book_meta.authors = author_names
.into_iter()
.enumerate() .enumerate()
.map(|(pos, name)| { .map(|(pos, name)| {
let mut author = crate::model::AuthorInfo::new(name); let mut author = crate::model::AuthorInfo::new(name);
author.position = pos as i32; author.position = i32::try_from(pos).unwrap_or(i32::MAX);
author author
}) })
.collect(); .collect();
@ -94,7 +88,7 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
let pages = doc.get_pages(); let pages = doc.get_pages();
let page_count = pages.len(); let page_count = pages.len();
if page_count > 0 { if page_count > 0 {
book_meta.page_count = Some(page_count as i32); book_meta.page_count = Some(i32::try_from(page_count).unwrap_or(i32::MAX));
} }
// Try to extract ISBN from first few pages // Try to extract ISBN from first few pages
@ -187,7 +181,7 @@ fn extract_epub(path: &Path) -> Result<ExtractedMetadata> {
// Check for role in refinements // Check for role in refinements
if let Some(role_ref) = item.refinement("role") { if let Some(role_ref) = item.refinement("role") {
author.role = role_ref.value.clone(); author.role.clone_from(&role_ref.value);
} }
authors.push(author); authors.push(author);
@ -205,16 +199,18 @@ fn extract_epub(path: &Path) -> Result<ExtractedMetadata> {
.map(|r| r.value.to_lowercase()); .map(|r| r.value.to_lowercase());
let id_type = match scheme.as_deref() { let id_type = match scheme.as_deref() {
Some("isbn") => "isbn", Some("isbn" | "isbn-10" | "isbn10") => "isbn",
Some("isbn-10") | Some("isbn10") => "isbn", Some("isbn-13" | "isbn13") => "isbn13",
Some("isbn-13") | Some("isbn13") => "isbn13",
Some("asin") => "asin", Some("asin") => "asin",
Some("doi") => "doi", Some("doi") => "doi",
_ => { _ => {
// Fallback: detect from value pattern // Fallback: detect from value pattern.
if item.value.len() == 10 // ISBN-10 = 10 chars bare, ISBN-13 = 13 chars bare,
|| item.value.len() == 13 // hyphenated ISBN-10 = 13 chars, hyphenated ISBN-13 = 17 chars
|| item.value.contains('-') && item.value.len() < 20 // (e.g. 978-0-123-45678-9). Inner parentheses group the length alternatives: && binds tighter than ||.
if (item.value.len() == 10 || item.value.len() == 13)
|| (item.value.contains('-')
&& (item.value.len() == 13 || item.value.len() == 17))
{ {
"isbn" "isbn"
} else { } else {
@ -286,6 +282,15 @@ fn extract_djvu(path: &Path) -> Result<ExtractedMetadata> {
// DjVu files contain metadata in SEXPR (S-expression) format within // DjVu files contain metadata in SEXPR (S-expression) format within
// ANTa/ANTz chunks, or in the DIRM chunk. We parse the raw bytes to // ANTa/ANTz chunks, or in the DIRM chunk. We parse the raw bytes to
// extract any metadata fields we can find. // extract any metadata fields we can find.
// Guard against loading very large DjVu files into memory.
const MAX_DJVU_SIZE: u64 = 50 * 1024 * 1024; // 50 MB
let file_meta = std::fs::metadata(path)
.map_err(|e| PinakesError::MetadataExtraction(format!("DjVu stat: {e}")))?;
if file_meta.len() > MAX_DJVU_SIZE {
return Ok(ExtractedMetadata::default());
}
let data = std::fs::read(path) let data = std::fs::read(path)
.map_err(|e| PinakesError::MetadataExtraction(format!("DjVu read: {e}")))?; .map_err(|e| PinakesError::MetadataExtraction(format!("DjVu read: {e}")))?;

View file

@ -11,7 +11,8 @@ use crate::media_type::MediaType;
pub struct MediaId(pub Uuid); pub struct MediaId(pub Uuid);
impl MediaId { impl MediaId {
/// Creates a new media ID using UUIDv7. /// Creates a new media ID using `UUIDv7`.
#[must_use]
pub fn new() -> Self { pub fn new() -> Self {
Self(Uuid::now_v7()) Self(Uuid::now_v7())
} }
@ -25,7 +26,7 @@ impl fmt::Display for MediaId {
impl Default for MediaId { impl Default for MediaId {
fn default() -> Self { fn default() -> Self {
Self::new() Self(uuid::Uuid::nil())
} }
} }
@ -35,7 +36,8 @@ pub struct ContentHash(pub String);
impl ContentHash { impl ContentHash {
/// Creates a new content hash from a hex string. /// Creates a new content hash from a hex string.
pub fn new(hex: String) -> Self { #[must_use]
pub const fn new(hex: String) -> Self {
Self(hex) Self(hex)
} }
} }
@ -75,7 +77,7 @@ impl std::str::FromStr for StorageMode {
match s.to_lowercase().as_str() { match s.to_lowercase().as_str() {
"external" => Ok(Self::External), "external" => Ok(Self::External),
"managed" => Ok(Self::Managed), "managed" => Ok(Self::Managed),
_ => Err(format!("unknown storage mode: {}", s)), _ => Err(format!("unknown storage mode: {s}")),
} }
} }
} }
@ -147,11 +149,11 @@ pub struct MediaItem {
#[serde(default)] #[serde(default)]
pub storage_mode: StorageMode, pub storage_mode: StorageMode,
/// Original filename for uploaded files (preserved separately from /// Original filename for uploaded files (preserved separately from
/// file_name) /// `file_name`)
pub original_filename: Option<String>, pub original_filename: Option<String>,
/// When the file was uploaded to managed storage /// When the file was uploaded to managed storage
pub uploaded_at: Option<DateTime<Utc>>, pub uploaded_at: Option<DateTime<Utc>>,
/// Storage key for looking up the blob (usually same as content_hash) /// Storage key for looking up the blob (usually same as `content_hash`)
pub storage_key: Option<String>, pub storage_key: Option<String>,
pub created_at: DateTime<Utc>, pub created_at: DateTime<Utc>,
@ -182,7 +184,8 @@ pub enum CustomFieldType {
} }
impl CustomFieldType { impl CustomFieldType {
pub fn as_str(&self) -> &'static str { #[must_use]
pub const fn as_str(&self) -> &'static str {
match self { match self {
Self::Text => "text", Self::Text => "text",
Self::Number => "number", Self::Number => "number",
@ -228,7 +231,8 @@ pub enum CollectionKind {
} }
impl CollectionKind { impl CollectionKind {
pub fn as_str(&self) -> &'static str { #[must_use]
pub const fn as_str(&self) -> &'static str {
match self { match self {
Self::Manual => "manual", Self::Manual => "manual",
Self::Virtual => "virtual", Self::Virtual => "virtual",
@ -380,7 +384,8 @@ pub struct Pagination {
impl Pagination { impl Pagination {
/// Creates a new pagination instance. /// Creates a new pagination instance.
pub fn new(offset: u64, limit: u64, sort: Option<String>) -> Self { #[must_use]
pub const fn new(offset: u64, limit: u64, sort: Option<String>) -> Self {
Self { Self {
offset, offset,
limit, limit,
@ -431,7 +436,7 @@ pub struct BookMetadata {
} }
/// Information about a book author. /// Information about a book author.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct AuthorInfo { pub struct AuthorInfo {
pub name: String, pub name: String,
pub role: String, pub role: String,
@ -441,6 +446,7 @@ pub struct AuthorInfo {
impl AuthorInfo { impl AuthorInfo {
/// Creates a new author with the given name. /// Creates a new author with the given name.
#[must_use]
pub fn new(name: String) -> Self { pub fn new(name: String) -> Self {
Self { Self {
name, name,
@ -451,17 +457,20 @@ impl AuthorInfo {
} }
/// Sets the author's role. /// Sets the author's role.
#[must_use]
pub fn with_role(mut self, role: String) -> Self { pub fn with_role(mut self, role: String) -> Self {
self.role = role; self.role = role;
self self
} }
#[must_use]
pub fn with_file_as(mut self, file_as: String) -> Self { pub fn with_file_as(mut self, file_as: String) -> Self {
self.file_as = Some(file_as); self.file_as = Some(file_as);
self self
} }
pub fn with_position(mut self, position: i32) -> Self { #[must_use]
pub const fn with_position(mut self, position: i32) -> Self {
self.position = position; self.position = position;
self self
} }
@ -496,21 +505,20 @@ pub struct ReadingProgress {
impl ReadingProgress { impl ReadingProgress {
/// Creates a new reading progress entry. /// Creates a new reading progress entry.
#[must_use]
pub fn new( pub fn new(
media_id: MediaId, media_id: MediaId,
user_id: Uuid, user_id: Uuid,
current_page: i32, current_page: i32,
total_pages: Option<i32>, total_pages: Option<i32>,
) -> Self { ) -> Self {
let progress_percent = if let Some(total) = total_pages { let progress_percent = total_pages.map_or(0.0, |total| {
if total > 0 { if total > 0 {
(current_page as f64 / total as f64 * 100.0).min(100.0) (f64::from(current_page) / f64::from(total) * 100.0).min(100.0)
} else { } else {
0.0 0.0
} }
} else { });
0.0
};
Self { Self {
media_id, media_id,
@ -574,7 +582,7 @@ impl std::str::FromStr for LinkType {
"wikilink" => Ok(Self::Wikilink), "wikilink" => Ok(Self::Wikilink),
"markdown_link" => Ok(Self::MarkdownLink), "markdown_link" => Ok(Self::MarkdownLink),
"embed" => Ok(Self::Embed), "embed" => Ok(Self::Embed),
_ => Err(format!("unknown link type: {}", s)), _ => Err(format!("unknown link type: {s}")),
} }
} }
} }
@ -586,7 +594,7 @@ pub struct MarkdownLink {
pub source_media_id: MediaId, pub source_media_id: MediaId,
/// Raw link target as written in the source (wikilink name or path) /// Raw link target as written in the source (wikilink name or path)
pub target_path: String, pub target_path: String,
/// Resolved target media_id (None if unresolved) /// Resolved target `media_id` (None if unresolved)
pub target_media_id: Option<MediaId>, pub target_media_id: Option<MediaId>,
pub link_type: LinkType, pub link_type: LinkType,
/// Display text for the link /// Display text for the link