pinakes-core: fix isbn regex, csv quoting, document extraction, and enrichment accuracy

Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: I974959e74d2b5b5591437daa0f29291a6a6a6964
This commit is contained in:
raf 2026-03-08 00:42:01 +03:00
commit d5be5026a7
Signed by: NotAShelf
GPG key ID: 29D95B64378DB4BF
5 changed files with 132 additions and 90 deletions

View file

@ -36,11 +36,9 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
// Find the Info dictionary via the trailer
if let Ok(info_ref) = doc.trailer.get(b"Info") {
let info_obj = if let Ok(reference) = info_ref.as_reference() {
doc.get_object(reference).ok()
} else {
Some(info_ref)
};
let info_obj = info_ref
.as_reference()
.map_or(Some(info_ref), |reference| doc.get_object(reference).ok());
if let Some(obj) = info_obj
&& let Ok(dict) = obj.as_dict()
@ -50,23 +48,19 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
}
if let Ok(author) = dict.get(b"Author") {
let author_str = pdf_object_to_string(author);
meta.artist = author_str.clone();
meta.artist.clone_from(&author_str);
// Parse multiple authors if separated by semicolon, comma, or "and"
if let Some(authors_str) = author_str {
let author_names: Vec<String> = authors_str
book_meta.authors = authors_str
.split(&[';', ','][..])
.flat_map(|part| part.split(" and "))
.map(|name| name.trim().to_string())
.filter(|name| !name.is_empty())
.collect();
book_meta.authors = author_names
.into_iter()
.enumerate()
.map(|(pos, name)| {
let mut author = crate::model::AuthorInfo::new(name);
author.position = pos as i32;
author.position = i32::try_from(pos).unwrap_or(i32::MAX);
author
})
.collect();
@ -94,7 +88,7 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
let pages = doc.get_pages();
let page_count = pages.len();
if page_count > 0 {
book_meta.page_count = Some(page_count as i32);
book_meta.page_count = Some(i32::try_from(page_count).unwrap_or(i32::MAX));
}
// Try to extract ISBN from first few pages
@ -187,7 +181,7 @@ fn extract_epub(path: &Path) -> Result<ExtractedMetadata> {
// Check for role in refinements
if let Some(role_ref) = item.refinement("role") {
author.role = role_ref.value.clone();
author.role.clone_from(&role_ref.value);
}
authors.push(author);
@ -205,16 +199,18 @@ fn extract_epub(path: &Path) -> Result<ExtractedMetadata> {
.map(|r| r.value.to_lowercase());
let id_type = match scheme.as_deref() {
Some("isbn") => "isbn",
Some("isbn-10") | Some("isbn10") => "isbn",
Some("isbn-13") | Some("isbn13") => "isbn13",
Some("isbn" | "isbn-10" | "isbn10") => "isbn",
Some("isbn-13" | "isbn13") => "isbn13",
Some("asin") => "asin",
Some("doi") => "doi",
_ => {
// Fallback: detect from value pattern
if item.value.len() == 10
|| item.value.len() == 13
|| item.value.contains('-') && item.value.len() < 20
// Fallback: detect from value pattern.
// ISBN-10 = 10 chars bare, ISBN-13 = 13 chars bare,
// hyphenated ISBN-10 = 13 chars (e.g. 0-306-40615-2),
// hyphenated ISBN-13 = 17 chars (e.g. 978-0-123-45678-9).
// Parentheses required: && binds tighter than ||.
if (item.value.len() == 10 || item.value.len() == 13)
|| (item.value.contains('-')
&& (item.value.len() == 13 || item.value.len() == 17))
{
"isbn"
} else {
@ -286,6 +282,15 @@ fn extract_djvu(path: &Path) -> Result<ExtractedMetadata> {
// DjVu files contain metadata in SEXPR (S-expression) format within
// ANTa/ANTz chunks, or in the DIRM chunk. We parse the raw bytes to
// extract any metadata fields we can find.
// Guard against loading very large DjVu files into memory.
const MAX_DJVU_SIZE: u64 = 50 * 1024 * 1024; // 50 MB
let file_meta = std::fs::metadata(path)
.map_err(|e| PinakesError::MetadataExtraction(format!("DjVu stat: {e}")))?;
if file_meta.len() > MAX_DJVU_SIZE {
return Ok(ExtractedMetadata::default());
}
let data = std::fs::read(path)
.map_err(|e| PinakesError::MetadataExtraction(format!("DjVu read: {e}")))?;