// Signed-off-by: NotAShelf <raf@notashelf.dev>
// Change-Id: If5a21f16221f3c56a8008e139f93edc46a6a6964
use std::path::Path;
|
|
|
|
use crate::error::{PinakesError, Result};
|
|
use crate::media_type::{BuiltinMediaType, MediaType};
|
|
|
|
use super::{ExtractedMetadata, MetadataExtractor};
|
|
|
|
/// Metadata extractor for document formats: PDF, EPUB, and DjVu.
pub struct DocumentExtractor;
|
|
|
|
impl MetadataExtractor for DocumentExtractor {
|
|
fn extract(&self, path: &Path) -> Result<ExtractedMetadata> {
|
|
match MediaType::from_path(path) {
|
|
Some(MediaType::Builtin(BuiltinMediaType::Pdf)) => extract_pdf(path),
|
|
Some(MediaType::Builtin(BuiltinMediaType::Epub)) => extract_epub(path),
|
|
Some(MediaType::Builtin(BuiltinMediaType::Djvu)) => extract_djvu(path),
|
|
_ => Ok(ExtractedMetadata::default()),
|
|
}
|
|
}
|
|
|
|
fn supported_types(&self) -> Vec<MediaType> {
|
|
vec![
|
|
MediaType::Builtin(BuiltinMediaType::Pdf),
|
|
MediaType::Builtin(BuiltinMediaType::Epub),
|
|
MediaType::Builtin(BuiltinMediaType::Djvu),
|
|
]
|
|
}
|
|
}
|
|
|
|
/// Extracts metadata from a PDF file using `lopdf`.
///
/// Reads the document Info dictionary (Title, Author, Subject, Creator,
/// Producer), records the page count, and scans the content streams of the
/// first few pages for an ISBN.
///
/// # Errors
/// Returns `PinakesError::MetadataExtraction` when the PDF cannot be loaded.
fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
    let doc = lopdf::Document::load(path)
        .map_err(|e| PinakesError::MetadataExtraction(format!("PDF load: {e}")))?;

    let mut meta = ExtractedMetadata::default();
    let mut book_meta = crate::model::ExtractedBookMetadata::default();

    // Find the Info dictionary via the trailer
    if let Ok(info_ref) = doc.trailer.get(b"Info") {
        // /Info is usually an indirect reference, but may be an inline object.
        let info_obj = if let Ok(reference) = info_ref.as_reference() {
            doc.get_object(reference).ok()
        } else {
            Some(info_ref)
        };

        if let Some(obj) = info_obj
            && let Ok(dict) = obj.as_dict()
        {
            if let Ok(title) = dict.get(b"Title") {
                meta.title = pdf_object_to_string(title);
            }
            if let Ok(author) = dict.get(b"Author") {
                // Keep the raw author string in the generic "artist" field.
                let author_str = pdf_object_to_string(author);
                meta.artist = author_str.clone();

                // Parse multiple authors if separated by semicolon, comma, or "and"
                if let Some(authors_str) = author_str {
                    let author_names: Vec<String> = authors_str
                        .split(&[';', ','][..])
                        .flat_map(|part| part.split(" and "))
                        .map(|name| name.trim().to_string())
                        .filter(|name| !name.is_empty())
                        .collect();

                    // Preserve the original ordering via the position field.
                    book_meta.authors = author_names
                        .into_iter()
                        .enumerate()
                        .map(|(pos, name)| {
                            let mut author = crate::model::AuthorInfo::new(name);
                            author.position = pos as i32;
                            author
                        })
                        .collect();
                }
            }
            if let Ok(subject) = dict.get(b"Subject") {
                meta.description = pdf_object_to_string(subject);
            }
            // Creator/Producer are tool names rather than bibliographic data,
            // so they go into the free-form `extra` map.
            if let Ok(creator) = dict.get(b"Creator") {
                meta.extra.insert(
                    "creator".to_string(),
                    pdf_object_to_string(creator).unwrap_or_default(),
                );
            }
            if let Ok(producer) = dict.get(b"Producer") {
                meta.extra.insert(
                    "producer".to_string(),
                    pdf_object_to_string(producer).unwrap_or_default(),
                );
            }
        }
    }

    // Page count
    let pages = doc.get_pages();
    let page_count = pages.len();
    if page_count > 0 {
        book_meta.page_count = Some(page_count as i32);
    }

    // Try to extract ISBN from first few pages
    // Extract text from up to the first 5 pages and search for ISBN patterns
    let mut extracted_text = String::new();
    let max_pages = page_count.min(5);

    for (_page_num, page_id) in pages.iter().take(max_pages) {
        if let Ok(content) = doc.get_page_content(*page_id) {
            // PDF content streams contain raw operators, but may have text strings
            // NOTE(review): best-effort scan of the stream bytes, not proper
            // text extraction; streams that are not valid UTF-8 are skipped.
            if let Ok(text) = std::str::from_utf8(&content) {
                extracted_text.push_str(text);
                extracted_text.push(' ');
            }
        }
    }

    // Extract ISBN from the text
    if let Some(isbn) = crate::books::extract_isbn_from_text(&extracted_text)
        && let Ok(normalized) = crate::books::normalize_isbn(&isbn)
    {
        // Store both the normalized form and the value as found in the text.
        book_meta.isbn13 = Some(normalized);
        book_meta.isbn = Some(isbn);
    }

    // Set format
    book_meta.format = Some("pdf".to_string());

    meta.book_metadata = Some(book_meta);
    Ok(meta)
}
|
|
|
|
/// Converts a PDF String or Name object into an owned `String`, replacing
/// invalid UTF-8 with the replacement character. Any other object kind
/// yields `None`.
fn pdf_object_to_string(obj: &lopdf::Object) -> Option<String> {
    let raw: &[u8] = match obj {
        lopdf::Object::String(bytes, _) => bytes,
        lopdf::Object::Name(name) => name,
        _ => return None,
    };
    Some(String::from_utf8_lossy(raw).into_owned())
}
|
|
|
|
/// Extracts metadata from an EPUB file using the `epub` crate.
///
/// Reads Dublin Core fields (title, creator, description, language,
/// publisher, date), per-author refinements (`file-as`, `role`),
/// identifiers (with ISBN/ASIN/DOI scheme detection), and Calibre series
/// annotations from the OPF package document.
///
/// # Errors
/// Returns `PinakesError::MetadataExtraction` when the EPUB cannot be parsed.
fn extract_epub(path: &Path) -> Result<ExtractedMetadata> {
    let mut doc = epub::doc::EpubDoc::new(path)
        .map_err(|e| PinakesError::MetadataExtraction(format!("EPUB parse: {e}")))?;

    let mut meta = ExtractedMetadata {
        title: doc.mdata("title").map(|item| item.value.clone()),
        artist: doc.mdata("creator").map(|item| item.value.clone()),
        description: doc.mdata("description").map(|item| item.value.clone()),
        ..Default::default()
    };

    let mut book_meta = crate::model::ExtractedBookMetadata::default();

    // Extract basic metadata
    if let Some(lang) = doc.mdata("language") {
        book_meta.language = Some(lang.value.clone());
    }
    if let Some(publisher) = doc.mdata("publisher") {
        book_meta.publisher = Some(publisher.value.clone());
    }
    if let Some(date) = doc.mdata("date") {
        // Try to parse as YYYY-MM-DD or just YYYY
        if let Ok(parsed_date) = chrono::NaiveDate::parse_from_str(&date.value, "%Y-%m-%d") {
            book_meta.publication_date = Some(parsed_date);
        } else if let Ok(year) = date.value.parse::<i32>() {
            // Year-only dates are pinned to January 1st.
            book_meta.publication_date = chrono::NaiveDate::from_ymd_opt(year, 1, 1);
        }
    }

    // Extract authors - iterate through all metadata items
    let mut authors = Vec::new();
    let mut position = 0;
    for item in &doc.metadata {
        if item.property == "creator" || item.property == "dc:creator" {
            let mut author = crate::model::AuthorInfo::new(item.value.clone());
            author.position = position;
            position += 1;

            // Check for file-as in refinements
            if let Some(file_as_ref) = item.refinement("file-as") {
                author.file_as = Some(file_as_ref.value.clone());
            }

            // Check for role in refinements
            if let Some(role_ref) = item.refinement("role") {
                author.role = role_ref.value.clone();
            }

            authors.push(author);
        }
    }
    book_meta.authors = authors;

    // Extract ISBNs from identifiers
    let mut identifiers = std::collections::HashMap::new();
    for item in &doc.metadata {
        if item.property == "identifier" || item.property == "dc:identifier" {
            // Try to get scheme from refinements
            let scheme = item
                .refinement("identifier-type")
                .map(|r| r.value.to_lowercase());

            let id_type = match scheme.as_deref() {
                Some("isbn") => "isbn",
                Some("isbn-10") | Some("isbn10") => "isbn",
                Some("isbn-13") | Some("isbn13") => "isbn13",
                Some("asin") => "asin",
                Some("doi") => "doi",
                _ => {
                    // Fallback: detect from value pattern
                    // NOTE(review): heuristic — bare 10/13-char values or
                    // short hyphenated values are assumed to be ISBNs.
                    if item.value.len() == 10
                        || item.value.len() == 13
                        || item.value.contains('-') && item.value.len() < 20
                    {
                        "isbn"
                    } else {
                        "other"
                    }
                }
            };

            // Try to normalize ISBN
            if (id_type == "isbn" || id_type == "isbn13")
                && let Ok(normalized) = crate::books::normalize_isbn(&item.value)
            {
                // Store the normalized form alongside the raw value.
                book_meta.isbn13 = Some(normalized.clone());
                book_meta.isbn = Some(item.value.clone());
            }

            identifiers
                .entry(id_type.to_string())
                .or_insert_with(Vec::new)
                .push(item.value.clone());
        }
    }
    book_meta.identifiers = identifiers;

    // Extract Calibre series metadata by parsing the content.opf file
    // Try common OPF locations
    let opf_paths = vec!["OEBPS/content.opf", "content.opf", "OPS/content.opf"];
    let mut opf_data = None;
    for path in opf_paths {
        if let Some(data) = doc.get_resource_str_by_path(path) {
            opf_data = Some(data);
            break;
        }
    }

    if let Some(opf_content) = opf_data {
        // Look for <meta name="calibre:series" content="Series Name"/>
        if let Some(series_start) = opf_content.find("name=\"calibre:series\"")
            && let Some(content_start) = opf_content[series_start..].find("content=\"")
        {
            // Skip past `content="` (9 bytes) to the attribute value itself.
            let after_content = &opf_content[series_start + content_start + 9..];
            if let Some(quote_end) = after_content.find('"') {
                book_meta.series_name = Some(after_content[..quote_end].to_string());
            }
        }

        // Look for <meta name="calibre:series_index" content="1.0"/>
        if let Some(index_start) = opf_content.find("name=\"calibre:series_index\"")
            && let Some(content_start) = opf_content[index_start..].find("content=\"")
        {
            let after_content = &opf_content[index_start + content_start + 9..];
            if let Some(quote_end) = after_content.find('"')
                && let Ok(index) = after_content[..quote_end].parse::<f64>()
            {
                book_meta.series_index = Some(index);
            }
        }
    }

    // Set format
    book_meta.format = Some("epub".to_string());

    meta.book_metadata = Some(book_meta);
    Ok(meta)
}
|
|
|
|
fn extract_djvu(path: &Path) -> Result<ExtractedMetadata> {
|
|
// DjVu files contain metadata in SEXPR (S-expression) format within
|
|
// ANTa/ANTz chunks, or in the DIRM chunk. We parse the raw bytes to
|
|
// extract any metadata fields we can find.
|
|
let data = std::fs::read(path)
|
|
.map_err(|e| PinakesError::MetadataExtraction(format!("DjVu read: {e}")))?;
|
|
|
|
let mut meta = ExtractedMetadata::default();
|
|
|
|
// DjVu files start with "AT&T" magic followed by FORM:DJVU or FORM:DJVM
|
|
if data.len() < 16 {
|
|
return Ok(meta);
|
|
}
|
|
|
|
// Search for metadata annotations in the file. DjVu metadata is stored
|
|
// as S-expressions like (metadata (key "value") ...) within ANTa chunks.
|
|
let content = String::from_utf8_lossy(&data);
|
|
|
|
// Look for (metadata ...) blocks
|
|
if let Some(meta_start) = content.find("(metadata") {
|
|
let remainder = &content[meta_start..];
|
|
// Extract key-value pairs like (title "Some Title")
|
|
extract_djvu_field(remainder, "title", &mut meta.title);
|
|
extract_djvu_field(remainder, "author", &mut meta.artist);
|
|
|
|
let mut desc = None;
|
|
extract_djvu_field(remainder, "subject", &mut desc);
|
|
if desc.is_none() {
|
|
extract_djvu_field(remainder, "description", &mut desc);
|
|
}
|
|
meta.description = desc;
|
|
|
|
let mut year_str = None;
|
|
extract_djvu_field(remainder, "year", &mut year_str);
|
|
if let Some(ref y) = year_str {
|
|
meta.year = y.parse().ok();
|
|
}
|
|
|
|
let mut creator = None;
|
|
extract_djvu_field(remainder, "creator", &mut creator);
|
|
if let Some(c) = creator {
|
|
meta.extra.insert("creator".to_string(), c);
|
|
}
|
|
}
|
|
|
|
// Also check for booklet-style metadata that some DjVu encoders write
|
|
// outside the metadata SEXPR
|
|
if meta.title.is_none()
|
|
&& let Some(title_start) = content.find("(bookmarks")
|
|
{
|
|
let remainder = &content[title_start..];
|
|
// First bookmark title is often the document title
|
|
if let Some(q1) = remainder.find('"') {
|
|
let after_q1 = &remainder[q1 + 1..];
|
|
if let Some(q2) = after_q1.find('"') {
|
|
let val = &after_q1[..q2];
|
|
if !val.is_empty() {
|
|
meta.title = Some(val.to_string());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(meta)
|
|
}
|
|
|
|
/// Searches an S-expression annotation block for `(key "value")` and, when a
/// non-empty quoted value is found, stores it in `out`.
///
/// Matching requires a token boundary after the key, so that searching for
/// `title` does not accidentally hit a `(titlepage ...)` form; non-boundary
/// hits are skipped and the scan continues. `out` is left untouched when no
/// usable value exists.
fn extract_djvu_field(sexpr: &str, key: &str, out: &mut Option<String>) {
    let pattern = format!("({key}");
    let mut search_from = 0;
    while let Some(rel) = sexpr[search_from..].find(&pattern) {
        let start = search_from + rel;
        let after_key = &sexpr[start + pattern.len()..];

        // A genuine `(key ...)` form is followed by whitespace, a quote, or
        // a paren — never by further identifier characters.
        let is_boundary = after_key
            .chars()
            .next()
            .map_or(true, |c| !(c.is_alphanumeric() || c == '_' || c == '-'));
        if !is_boundary {
            // Partial match such as "(titlepage" for key "title": skip it
            // and keep scanning for a real occurrence.
            search_from = start + pattern.len();
            continue;
        }

        // Extract the first double-quoted value following the key.
        if let Some(q1) = after_key.find('"') {
            let after_q1 = &after_key[q1 + 1..];
            if let Some(q2) = after_q1.find('"') {
                let val = &after_q1[..q2];
                if !val.is_empty() {
                    *out = Some(val.to_string());
                }
            }
        }
        return;
    }
}
|