pinakes-core: fix isbn regex, csv quoting, document extraction, and enrichment accuracy
Signed-off-by: NotAShelf <raf@notashelf.dev> Change-Id: I974959e74d2b5b5591437daa0f29291a6a6a6964
This commit is contained in:
parent
d77e5b9f2f
commit
d5be5026a7
5 changed files with 132 additions and 90 deletions
|
|
@ -36,11 +36,9 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
|
|||
|
||||
// Find the Info dictionary via the trailer
|
||||
if let Ok(info_ref) = doc.trailer.get(b"Info") {
|
||||
let info_obj = if let Ok(reference) = info_ref.as_reference() {
|
||||
doc.get_object(reference).ok()
|
||||
} else {
|
||||
Some(info_ref)
|
||||
};
|
||||
let info_obj = info_ref
|
||||
.as_reference()
|
||||
.map_or(Some(info_ref), |reference| doc.get_object(reference).ok());
|
||||
|
||||
if let Some(obj) = info_obj
|
||||
&& let Ok(dict) = obj.as_dict()
|
||||
|
|
@ -50,23 +48,19 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
|
|||
}
|
||||
if let Ok(author) = dict.get(b"Author") {
|
||||
let author_str = pdf_object_to_string(author);
|
||||
meta.artist = author_str.clone();
|
||||
meta.artist.clone_from(&author_str);
|
||||
|
||||
// Parse multiple authors if separated by semicolon, comma, or "and"
|
||||
if let Some(authors_str) = author_str {
|
||||
let author_names: Vec<String> = authors_str
|
||||
book_meta.authors = authors_str
|
||||
.split(&[';', ','][..])
|
||||
.flat_map(|part| part.split(" and "))
|
||||
.map(|name| name.trim().to_string())
|
||||
.filter(|name| !name.is_empty())
|
||||
.collect();
|
||||
|
||||
book_meta.authors = author_names
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(pos, name)| {
|
||||
let mut author = crate::model::AuthorInfo::new(name);
|
||||
author.position = pos as i32;
|
||||
author.position = i32::try_from(pos).unwrap_or(i32::MAX);
|
||||
author
|
||||
})
|
||||
.collect();
|
||||
|
|
@ -94,7 +88,7 @@ fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
|
|||
let pages = doc.get_pages();
|
||||
let page_count = pages.len();
|
||||
if page_count > 0 {
|
||||
book_meta.page_count = Some(page_count as i32);
|
||||
book_meta.page_count = Some(i32::try_from(page_count).unwrap_or(i32::MAX));
|
||||
}
|
||||
|
||||
// Try to extract ISBN from first few pages
|
||||
|
|
@ -187,7 +181,7 @@ fn extract_epub(path: &Path) -> Result<ExtractedMetadata> {
|
|||
|
||||
// Check for role in refinements
|
||||
if let Some(role_ref) = item.refinement("role") {
|
||||
author.role = role_ref.value.clone();
|
||||
author.role.clone_from(&role_ref.value);
|
||||
}
|
||||
|
||||
authors.push(author);
|
||||
|
|
@ -205,16 +199,18 @@ fn extract_epub(path: &Path) -> Result<ExtractedMetadata> {
|
|||
.map(|r| r.value.to_lowercase());
|
||||
|
||||
let id_type = match scheme.as_deref() {
|
||||
Some("isbn") => "isbn",
|
||||
Some("isbn-10") | Some("isbn10") => "isbn",
|
||||
Some("isbn-13") | Some("isbn13") => "isbn13",
|
||||
Some("isbn" | "isbn-10" | "isbn10") => "isbn",
|
||||
Some("isbn-13" | "isbn13") => "isbn13",
|
||||
Some("asin") => "asin",
|
||||
Some("doi") => "doi",
|
||||
_ => {
|
||||
// Fallback: detect from value pattern
|
||||
if item.value.len() == 10
|
||||
|| item.value.len() == 13
|
||||
|| item.value.contains('-') && item.value.len() < 20
|
||||
// Fallback: detect from value pattern.
|
||||
// ISBN-10 = 10 chars bare, ISBN-13 = 13 chars bare,
|
||||
// hyphenated ISBN-10 = 13 chars (e.g. 0-306-40615-2), hyphenated ISBN-13 = 17 chars (e.g. 978-0-123-45678-9).
|
||||
// Parentheses required: && binds tighter than ||.
|
||||
if (item.value.len() == 10 || item.value.len() == 13)
|
||||
|| (item.value.contains('-')
|
||||
&& (item.value.len() == 13 || item.value.len() == 17))
|
||||
{
|
||||
"isbn"
|
||||
} else {
|
||||
|
|
@ -286,6 +282,15 @@ fn extract_djvu(path: &Path) -> Result<ExtractedMetadata> {
|
|||
// DjVu files contain metadata in SEXPR (S-expression) format within
|
||||
// ANTa/ANTz chunks, or in the DIRM chunk. We parse the raw bytes to
|
||||
// extract any metadata fields we can find.
|
||||
|
||||
// Guard against loading very large DjVu files into memory.
|
||||
const MAX_DJVU_SIZE: u64 = 50 * 1024 * 1024; // 50 MB
|
||||
let file_meta = std::fs::metadata(path)
|
||||
.map_err(|e| PinakesError::MetadataExtraction(format!("DjVu stat: {e}")))?;
|
||||
if file_meta.len() > MAX_DJVU_SIZE {
|
||||
return Ok(ExtractedMetadata::default());
|
||||
}
|
||||
|
||||
let data = std::fs::read(path)
|
||||
.map_err(|e| PinakesError::MetadataExtraction(format!("DjVu read: {e}")))?;
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue