pinakes-core: fix isbn regex, csv quoting, document extraction, and enrichment accuracy

Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: I974959e74d2b5b5591437daa0f29291a6a6a6964
This commit is contained in:
raf 2026-03-08 00:42:01 +03:00
commit d5be5026a7
Signed by: NotAShelf
GPG key ID: 29D95B64378DB4BF
5 changed files with 132 additions and 90 deletions

View file

@ -1,6 +1,32 @@
use std::sync::LazyLock;
use crate::error::{PinakesError, Result};
/// Compiled ISBN extraction patterns. Compiled once, reused on every call.
///
/// Ordered from most to least specific so the first pattern that matches in
/// `extract_isbn_from_text` wins.
static ISBN_PATTERNS: LazyLock<Vec<regex::Regex>> = LazyLock::new(|| {
    [
        // ISBN followed by colon or "is" with hyphens (most specific)
        r"ISBN(?:-13)?(?:\s+is|:)?\s*(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)",
        r"ISBN(?:-10)?(?:\s+is|:)?\s*(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])",
        // ISBN with just whitespace
        r"ISBN(?:-13)?\s+(\d{13})",
        r"ISBN(?:-10)?\s+(\d{9}[\dXx])",
        // Bare ISBN-13 with hyphens (in case "ISBN" is missing)
        r"\b(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)\b",
        // Bare ISBN-10 with hyphens
        r"\b(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])\b",
    ]
    .iter()
    // The patterns are compile-time constants: a failure to compile is a
    // programming bug, so fail loudly instead of silently dropping the
    // pattern (`filter_map(.. .ok())` would hide the defect and quietly
    // degrade ISBN extraction).
    .map(|p| regex::Regex::new(p).expect("hard-coded ISBN regex must compile"))
    .collect()
});
/// Normalize ISBN to ISBN-13 format
///
/// # Errors
///
/// Returns [`PinakesError`] if the ISBN is invalid or has an incorrect
/// checksum.
pub fn normalize_isbn(isbn: &str) -> Result<String> {
// Remove hyphens, spaces, and any non-numeric characters (except X for
// ISBN-10)
@ -16,15 +42,13 @@ pub fn normalize_isbn(isbn: &str) -> Result<String> {
Ok(clean)
} else {
Err(PinakesError::InvalidData(format!(
"Invalid ISBN-13 checksum: {}",
isbn
"Invalid ISBN-13 checksum: {isbn}"
)))
}
},
_ => {
Err(PinakesError::InvalidData(format!(
"Invalid ISBN length: {}",
isbn
"Invalid ISBN length: {isbn}"
)))
},
}
@ -34,8 +58,7 @@ pub fn normalize_isbn(isbn: &str) -> Result<String> {
fn isbn10_to_isbn13(isbn10: &str) -> Result<String> {
if isbn10.len() != 10 {
return Err(PinakesError::InvalidData(format!(
"ISBN-10 must be 10 characters: {}",
isbn10
"ISBN-10 must be 10 characters: {isbn10}"
)));
}
@ -87,37 +110,21 @@ fn is_valid_isbn13(isbn13: &str) -> bool {
}
/// Extract ISBN from text (searches for ISBN-10 or ISBN-13 patterns)
#[must_use]
pub fn extract_isbn_from_text(text: &str) -> Option<String> {
use regex::Regex;
// Try different patterns in order of specificity
let patterns = vec![
// ISBN followed by colon or "is" with hyphens (most specific)
r"ISBN(?:-13)?(?:\s+is|:)?\s*(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)",
r"ISBN(?:-10)?(?:\s+is|:)?\s*(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])",
// ISBN with just whitespace
r"ISBN(?:-13)?\s+(\d{13})",
r"ISBN(?:-10)?\s+(\d{9}[\dXx])",
// Bare ISBN-13 with hyphens (in case "ISBN" is missing)
r"\b(\d{3}-\d{1,5}-\d{1,7}-\d{1,7}-\d)\b",
// Bare ISBN-10 with hyphens
r"\b(\d{1,5}-\d{1,7}-\d{1,7}-[\dXx])\b",
];
for pattern_str in patterns {
if let Ok(pattern) = Regex::new(pattern_str)
&& let Some(captures) = pattern.captures(text)
for pattern in ISBN_PATTERNS.iter() {
if let Some(captures) = pattern.captures(text)
&& let Some(isbn) = captures.get(1)
&& let Ok(normalized) = normalize_isbn(isbn.as_str())
{
return Some(normalized);
}
}
None
}
/// Parse author name into "Last, First" format for sorting
#[must_use]
pub fn parse_author_file_as(name: &str) -> String {
// Simple heuristic: if already contains comma, use as-is
if name.contains(',') {
@ -136,7 +143,7 @@ pub fn parse_author_file_as(name: &str) -> String {
return String::new();
};
let given_names = parts[..parts.len() - 1].join(" ");
format!("{}, {}", surname, given_names)
format!("{surname}, {given_names}")
},
}
}

View file

@ -1,6 +1,6 @@
//! MusicBrainz metadata enrichment for audio files.
//! `MusicBrainz` metadata enrichment for audio files.
use std::time::Duration;
use std::{fmt::Write as _, time::Duration};
use chrono::Utc;
use uuid::Uuid;
@ -23,14 +23,17 @@ impl Default for MusicBrainzEnricher {
}
impl MusicBrainzEnricher {
/// Create a new `MusicBrainzEnricher`.
///
/// Builds an HTTP client with a MusicBrainz-compliant `User-Agent` and
/// request/connect timeouts. If the configured builder fails (e.g. TLS
/// backend initialization error), falls back to a default client so that
/// construction never panics; the fallback merely loses the custom
/// timeouts, not functionality.
#[must_use]
pub fn new() -> Self {
    let client = reqwest::Client::builder()
        .user_agent("Pinakes/0.1 (https://github.com/notashelf/pinakes)")
        .timeout(Duration::from_secs(10))
        .connect_timeout(Duration::from_secs(5))
        .build()
        .unwrap_or_else(|_| reqwest::Client::new());

    Self {
        client,
        base_url: "https://musicbrainz.org/ws/2".to_string(),
    }
}
@ -65,7 +68,7 @@ impl MetadataEnricher for MusicBrainzEnricher {
let mut query = format!("recording:{}", escape_lucene_query(title));
if let Some(ref artist) = item.artist {
query.push_str(&format!(" AND artist:{}", escape_lucene_query(artist)));
let _ = write!(query, " AND artist:{}", escape_lucene_query(artist));
}
let url = format!("{}/recording/", self.base_url);
@ -113,7 +116,7 @@ impl MetadataEnricher for MusicBrainzEnricher {
})?;
let recordings = json.get("recordings").and_then(|r| r.as_array());
if recordings.is_none_or(|r| r.is_empty()) {
if recordings.is_none_or(std::vec::Vec::is_empty) {
return Ok(None);
}
@ -125,11 +128,12 @@ impl MetadataEnricher for MusicBrainzEnricher {
.get("id")
.and_then(|id| id.as_str())
.map(String::from);
let score = recording
let score = (recording
.get("score")
.and_then(|s| s.as_f64())
.and_then(serde_json::Value::as_f64)
.unwrap_or(0.0)
/ 100.0;
/ 100.0)
.min(1.0);
Ok(Some(ExternalMetadata {
id: Uuid::now_v7(),

View file

@ -2,6 +2,16 @@ use std::path::Path;
use serde::{Deserialize, Serialize};
/// Quote a string field for CSV output per RFC 4180.
///
/// A field is wrapped in double-quotes only when it contains a character
/// that requires quoting (comma, double-quote, CR, or LF); internal
/// double-quotes are escaped by doubling them. Fields without special
/// characters are returned unchanged, which keeps the output minimal and
/// avoids needless quoting of plain values.
fn csv_quote(s: &str) -> String {
    if s.contains([',', '"', '\n', '\r']) {
        format!("\"{}\"", s.replace('"', "\"\""))
    } else {
        s.to_string()
    }
}
use crate::{error::Result, jobs::ExportFormat, storage::DynStorageBackend};
#[derive(Debug, Clone, Serialize, Deserialize)]
@ -11,6 +21,11 @@ pub struct ExportResult {
}
/// Export library data to the specified format.
///
/// # Errors
///
/// Returns an error if the storage query fails or if the output file cannot
/// be written.
pub async fn export_library(
storage: &DynStorageBackend,
format: &ExportFormat,
@ -38,27 +53,30 @@ pub async fn export_library(
album,genre,year,duration_secs,description,created_at,updated_at\n",
);
for item in &items {
csv.push_str(&format!(
"{},{},{},{:?},{},{},{},{},{},{},{},{},{},{},{}\n",
use std::fmt::Write as _;
writeln!(
csv,
"{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}",
item.id,
item.path.display(),
item.file_name,
item.media_type,
csv_quote(&item.path.display().to_string()),
csv_quote(&item.file_name),
csv_quote(&format!("{:?}", item.media_type)),
item.content_hash,
item.file_size,
item.title.as_deref().unwrap_or(""),
item.artist.as_deref().unwrap_or(""),
item.album.as_deref().unwrap_or(""),
item.genre.as_deref().unwrap_or(""),
csv_quote(item.title.as_deref().unwrap_or("")),
csv_quote(item.artist.as_deref().unwrap_or("")),
csv_quote(item.album.as_deref().unwrap_or("")),
csv_quote(item.genre.as_deref().unwrap_or("")),
item.year.map(|y| y.to_string()).unwrap_or_default(),
item
.duration_secs
.map(|d| d.to_string())
.unwrap_or_default(),
item.description.as_deref().unwrap_or(""),
csv_quote(item.description.as_deref().unwrap_or("")),
item.created_at,
item.updated_at,
));
)
.unwrap_or_default();
}
std::fs::write(destination, csv)?;
},

View file

@ -36,11 +36,9 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
// Find the Info dictionary via the trailer
if let Ok(info_ref) = doc.trailer.get(b"Info") {
let info_obj = if let Ok(reference) = info_ref.as_reference() {
doc.get_object(reference).ok()
} else {
Some(info_ref)
};
let info_obj = info_ref
.as_reference()
.map_or(Some(info_ref), |reference| doc.get_object(reference).ok());
if let Some(obj) = info_obj
&& let Ok(dict) = obj.as_dict()
@ -50,23 +48,19 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
}
if let Ok(author) = dict.get(b"Author") {
let author_str = pdf_object_to_string(author);
meta.artist = author_str.clone();
meta.artist.clone_from(&author_str);
// Parse multiple authors if separated by semicolon, comma, or "and"
if let Some(authors_str) = author_str {
let author_names: Vec<String> = authors_str
book_meta.authors = authors_str
.split(&[';', ','][..])
.flat_map(|part| part.split(" and "))
.map(|name| name.trim().to_string())
.filter(|name| !name.is_empty())
.collect();
book_meta.authors = author_names
.into_iter()
.enumerate()
.map(|(pos, name)| {
let mut author = crate::model::AuthorInfo::new(name);
author.position = pos as i32;
author.position = i32::try_from(pos).unwrap_or(i32::MAX);
author
})
.collect();
@ -94,7 +88,7 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
let pages = doc.get_pages();
let page_count = pages.len();
if page_count > 0 {
book_meta.page_count = Some(page_count as i32);
book_meta.page_count = Some(i32::try_from(page_count).unwrap_or(i32::MAX));
}
// Try to extract ISBN from first few pages
@ -187,7 +181,7 @@ fn extract_epub(path: &Path) -> Result<ExtractedMetadata> {
// Check for role in refinements
if let Some(role_ref) = item.refinement("role") {
author.role = role_ref.value.clone();
author.role.clone_from(&role_ref.value);
}
authors.push(author);
@ -205,16 +199,18 @@ fn extract_epub(path: &Path) -> Result<ExtractedMetadata> {
.map(|r| r.value.to_lowercase());
let id_type = match scheme.as_deref() {
Some("isbn") => "isbn",
Some("isbn-10") | Some("isbn10") => "isbn",
Some("isbn-13") | Some("isbn13") => "isbn13",
Some("isbn" | "isbn-10" | "isbn10") => "isbn",
Some("isbn-13" | "isbn13") => "isbn13",
Some("asin") => "asin",
Some("doi") => "doi",
_ => {
// Fallback: detect from value pattern
if item.value.len() == 10
|| item.value.len() == 13
|| item.value.contains('-') && item.value.len() < 20
// Fallback: detect from value pattern.
// ISBN-10 = 10 chars bare, ISBN-13 = 13 chars bare,
// hyphenated ISBN-13 = 17 chars (e.g. 978-0-123-45678-9).
// Parentheses required: && binds tighter than ||.
if (item.value.len() == 10 || item.value.len() == 13)
|| (item.value.contains('-')
&& (item.value.len() == 13 || item.value.len() == 17))
{
"isbn"
} else {
@ -286,6 +282,15 @@ fn extract_djvu(path: &Path) -> Result<ExtractedMetadata> {
// DjVu files contain metadata in SEXPR (S-expression) format within
// ANTa/ANTz chunks, or in the DIRM chunk. We parse the raw bytes to
// extract any metadata fields we can find.
// Guard against loading very large DjVu files into memory.
const MAX_DJVU_SIZE: u64 = 50 * 1024 * 1024; // 50 MB
let file_meta = std::fs::metadata(path)
.map_err(|e| PinakesError::MetadataExtraction(format!("DjVu stat: {e}")))?;
if file_meta.len() > MAX_DJVU_SIZE {
return Ok(ExtractedMetadata::default());
}
let data = std::fs::read(path)
.map_err(|e| PinakesError::MetadataExtraction(format!("DjVu read: {e}")))?;

View file

@ -11,7 +11,8 @@ use crate::media_type::MediaType;
pub struct MediaId(pub Uuid);
impl MediaId {
/// Creates a new media ID using UUIDv7.
/// Creates a new media ID using `UUIDv7`.
#[must_use]
pub fn new() -> Self {
Self(Uuid::now_v7())
}
@ -25,7 +26,7 @@ impl fmt::Display for MediaId {
impl Default for MediaId {
fn default() -> Self {
Self::new()
Self(uuid::Uuid::nil())
}
}
@ -35,7 +36,8 @@ pub struct ContentHash(pub String);
impl ContentHash {
/// Creates a new content hash from a hex string.
pub fn new(hex: String) -> Self {
#[must_use]
pub const fn new(hex: String) -> Self {
Self(hex)
}
}
@ -75,7 +77,7 @@ impl std::str::FromStr for StorageMode {
match s.to_lowercase().as_str() {
"external" => Ok(Self::External),
"managed" => Ok(Self::Managed),
_ => Err(format!("unknown storage mode: {}", s)),
_ => Err(format!("unknown storage mode: {s}")),
}
}
}
@ -147,11 +149,11 @@ pub struct MediaItem {
#[serde(default)]
pub storage_mode: StorageMode,
/// Original filename for uploaded files (preserved separately from
/// file_name)
/// `file_name`)
pub original_filename: Option<String>,
/// When the file was uploaded to managed storage
pub uploaded_at: Option<DateTime<Utc>>,
/// Storage key for looking up the blob (usually same as content_hash)
/// Storage key for looking up the blob (usually same as `content_hash`)
pub storage_key: Option<String>,
pub created_at: DateTime<Utc>,
@ -182,7 +184,8 @@ pub enum CustomFieldType {
}
impl CustomFieldType {
pub fn as_str(&self) -> &'static str {
#[must_use]
pub const fn as_str(&self) -> &'static str {
match self {
Self::Text => "text",
Self::Number => "number",
@ -228,7 +231,8 @@ pub enum CollectionKind {
}
impl CollectionKind {
pub fn as_str(&self) -> &'static str {
#[must_use]
pub const fn as_str(&self) -> &'static str {
match self {
Self::Manual => "manual",
Self::Virtual => "virtual",
@ -380,7 +384,8 @@ pub struct Pagination {
impl Pagination {
/// Creates a new pagination instance.
pub fn new(offset: u64, limit: u64, sort: Option<String>) -> Self {
#[must_use]
pub const fn new(offset: u64, limit: u64, sort: Option<String>) -> Self {
Self {
offset,
limit,
@ -431,7 +436,7 @@ pub struct BookMetadata {
}
/// Information about a book author.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct AuthorInfo {
pub name: String,
pub role: String,
@ -441,6 +446,7 @@ pub struct AuthorInfo {
impl AuthorInfo {
/// Creates a new author with the given name.
#[must_use]
pub fn new(name: String) -> Self {
Self {
name,
@ -451,17 +457,20 @@ impl AuthorInfo {
}
/// Sets the author's role.
#[must_use]
pub fn with_role(mut self, role: String) -> Self {
self.role = role;
self
}
#[must_use]
pub fn with_file_as(mut self, file_as: String) -> Self {
self.file_as = Some(file_as);
self
}
pub fn with_position(mut self, position: i32) -> Self {
#[must_use]
pub const fn with_position(mut self, position: i32) -> Self {
self.position = position;
self
}
@ -496,21 +505,20 @@ pub struct ReadingProgress {
impl ReadingProgress {
/// Creates a new reading progress entry.
#[must_use]
pub fn new(
media_id: MediaId,
user_id: Uuid,
current_page: i32,
total_pages: Option<i32>,
) -> Self {
let progress_percent = if let Some(total) = total_pages {
let progress_percent = total_pages.map_or(0.0, |total| {
if total > 0 {
(current_page as f64 / total as f64 * 100.0).min(100.0)
(f64::from(current_page) / f64::from(total) * 100.0).min(100.0)
} else {
0.0
}
} else {
0.0
};
});
Self {
media_id,
@ -574,7 +582,7 @@ impl std::str::FromStr for LinkType {
"wikilink" => Ok(Self::Wikilink),
"markdown_link" => Ok(Self::MarkdownLink),
"embed" => Ok(Self::Embed),
_ => Err(format!("unknown link type: {}", s)),
_ => Err(format!("unknown link type: {s}")),
}
}
}
@ -586,7 +594,7 @@ pub struct MarkdownLink {
pub source_media_id: MediaId,
/// Raw link target as written in the source (wikilink name or path)
pub target_path: String,
/// Resolved target media_id (None if unresolved)
/// Resolved target `media_id` (None if unresolved)
pub target_media_id: Option<MediaId>,
pub link_type: LinkType,
/// Display text for the link