initial commit
Signed-off-by: NotAShelf <raf@notashelf.dev> Change-Id: I4a6b498153eccd5407510dd541b7f4816a6a6964
This commit is contained in:
commit
6a73d11c4b
124 changed files with 34856 additions and 0 deletions
192
crates/pinakes-core/src/metadata/document.rs
Normal file
192
crates/pinakes-core/src/metadata/document.rs
Normal file
|
|
@ -0,0 +1,192 @@
|
|||
use std::path::Path;
|
||||
|
||||
use crate::error::{PinakesError, Result};
|
||||
use crate::media_type::MediaType;
|
||||
|
||||
use super::{ExtractedMetadata, MetadataExtractor};
|
||||
|
||||
/// Metadata extractor for document files.
///
/// Its [`MetadataExtractor`] implementation dispatches on the media type
/// detected from the file path and handles PDF, EPUB and DjVu files.
pub struct DocumentExtractor;
|
||||
|
||||
impl MetadataExtractor for DocumentExtractor {
|
||||
fn extract(&self, path: &Path) -> Result<ExtractedMetadata> {
|
||||
match MediaType::from_path(path) {
|
||||
Some(MediaType::Pdf) => extract_pdf(path),
|
||||
Some(MediaType::Epub) => extract_epub(path),
|
||||
Some(MediaType::Djvu) => extract_djvu(path),
|
||||
_ => Ok(ExtractedMetadata::default()),
|
||||
}
|
||||
}
|
||||
|
||||
fn supported_types(&self) -> &[MediaType] {
|
||||
&[MediaType::Pdf, MediaType::Epub, MediaType::Djvu]
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
|
||||
let doc = lopdf::Document::load(path)
|
||||
.map_err(|e| PinakesError::MetadataExtraction(format!("PDF load: {e}")))?;
|
||||
|
||||
let mut meta = ExtractedMetadata::default();
|
||||
|
||||
// Find the Info dictionary via the trailer
|
||||
if let Ok(info_ref) = doc.trailer.get(b"Info") {
|
||||
let info_obj = if let Ok(reference) = info_ref.as_reference() {
|
||||
doc.get_object(reference).ok()
|
||||
} else {
|
||||
Some(info_ref)
|
||||
};
|
||||
|
||||
if let Some(obj) = info_obj
|
||||
&& let Ok(dict) = obj.as_dict()
|
||||
{
|
||||
if let Ok(title) = dict.get(b"Title") {
|
||||
meta.title = pdf_object_to_string(title);
|
||||
}
|
||||
if let Ok(author) = dict.get(b"Author") {
|
||||
meta.artist = pdf_object_to_string(author);
|
||||
}
|
||||
if let Ok(subject) = dict.get(b"Subject") {
|
||||
meta.description = pdf_object_to_string(subject);
|
||||
}
|
||||
if let Ok(creator) = dict.get(b"Creator") {
|
||||
meta.extra.insert(
|
||||
"creator".to_string(),
|
||||
pdf_object_to_string(creator).unwrap_or_default(),
|
||||
);
|
||||
}
|
||||
if let Ok(producer) = dict.get(b"Producer") {
|
||||
meta.extra.insert(
|
||||
"producer".to_string(),
|
||||
pdf_object_to_string(producer).unwrap_or_default(),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Page count
|
||||
let page_count = doc.get_pages().len();
|
||||
if page_count > 0 {
|
||||
meta.extra
|
||||
.insert("page_count".to_string(), page_count.to_string());
|
||||
}
|
||||
|
||||
Ok(meta)
|
||||
}
|
||||
|
||||
/// Converts a PDF object into a Rust string when it is string-like.
///
/// `String` objects are decoded as PDF text strings (see
/// [`decode_pdf_text_string`]); `Name` objects are decoded as lossy UTF-8.
/// Every other object kind yields `None`.
fn pdf_object_to_string(obj: &lopdf::Object) -> Option<String> {
    match obj {
        lopdf::Object::String(bytes, _) => Some(decode_pdf_text_string(bytes)),
        lopdf::Object::Name(name) => Some(String::from_utf8_lossy(name).into_owned()),
        _ => None,
    }
}

/// Decodes the raw bytes of a PDF text string.
///
/// PDF text strings that start with the byte-order mark `FE FF` are UTF-16BE;
/// decoding those as UTF-8 would produce replacement characters and
/// interleaved NULs. A leading UTF-8 byte-order mark (`EF BB BF`, PDF 2.0) is
/// stripped. Anything else is decoded as lossy UTF-8.
fn decode_pdf_text_string(bytes: &[u8]) -> String {
    if let Some(payload) = bytes.strip_prefix(&[0xFE_u8, 0xFF]) {
        // A trailing odd byte cannot form a code unit and is ignored.
        let units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        return String::from_utf16_lossy(&units);
    }

    let payload = bytes.strip_prefix(&[0xEF_u8, 0xBB, 0xBF]).unwrap_or(bytes);
    String::from_utf8_lossy(payload).into_owned()
}
|
||||
|
||||
fn extract_epub(path: &Path) -> Result<ExtractedMetadata> {
|
||||
let doc = epub::doc::EpubDoc::new(path)
|
||||
.map_err(|e| PinakesError::MetadataExtraction(format!("EPUB parse: {e}")))?;
|
||||
|
||||
let mut meta = ExtractedMetadata {
|
||||
title: doc.mdata("title").map(|item| item.value.clone()),
|
||||
artist: doc.mdata("creator").map(|item| item.value.clone()),
|
||||
description: doc.mdata("description").map(|item| item.value.clone()),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
if let Some(lang) = doc.mdata("language") {
|
||||
meta.extra
|
||||
.insert("language".to_string(), lang.value.clone());
|
||||
}
|
||||
if let Some(publisher) = doc.mdata("publisher") {
|
||||
meta.extra
|
||||
.insert("publisher".to_string(), publisher.value.clone());
|
||||
}
|
||||
if let Some(date) = doc.mdata("date") {
|
||||
meta.extra.insert("date".to_string(), date.value.clone());
|
||||
}
|
||||
|
||||
Ok(meta)
|
||||
}
|
||||
|
||||
fn extract_djvu(path: &Path) -> Result<ExtractedMetadata> {
|
||||
// DjVu files contain metadata in SEXPR (S-expression) format within
|
||||
// ANTa/ANTz chunks, or in the DIRM chunk. We parse the raw bytes to
|
||||
// extract any metadata fields we can find.
|
||||
let data = std::fs::read(path)
|
||||
.map_err(|e| PinakesError::MetadataExtraction(format!("DjVu read: {e}")))?;
|
||||
|
||||
let mut meta = ExtractedMetadata::default();
|
||||
|
||||
// DjVu files start with "AT&T" magic followed by FORM:DJVU or FORM:DJVM
|
||||
if data.len() < 16 {
|
||||
return Ok(meta);
|
||||
}
|
||||
|
||||
// Search for metadata annotations in the file. DjVu metadata is stored
|
||||
// as S-expressions like (metadata (key "value") ...) within ANTa chunks.
|
||||
let content = String::from_utf8_lossy(&data);
|
||||
|
||||
// Look for (metadata ...) blocks
|
||||
if let Some(meta_start) = content.find("(metadata") {
|
||||
let remainder = &content[meta_start..];
|
||||
// Extract key-value pairs like (title "Some Title")
|
||||
extract_djvu_field(remainder, "title", &mut meta.title);
|
||||
extract_djvu_field(remainder, "author", &mut meta.artist);
|
||||
|
||||
let mut desc = None;
|
||||
extract_djvu_field(remainder, "subject", &mut desc);
|
||||
if desc.is_none() {
|
||||
extract_djvu_field(remainder, "description", &mut desc);
|
||||
}
|
||||
meta.description = desc;
|
||||
|
||||
let mut year_str = None;
|
||||
extract_djvu_field(remainder, "year", &mut year_str);
|
||||
if let Some(ref y) = year_str {
|
||||
meta.year = y.parse().ok();
|
||||
}
|
||||
|
||||
let mut creator = None;
|
||||
extract_djvu_field(remainder, "creator", &mut creator);
|
||||
if let Some(c) = creator {
|
||||
meta.extra.insert("creator".to_string(), c);
|
||||
}
|
||||
}
|
||||
|
||||
// Also check for booklet-style metadata that some DjVu encoders write
|
||||
// outside the metadata SEXPR
|
||||
if meta.title.is_none()
|
||||
&& let Some(title_start) = content.find("(bookmarks")
|
||||
{
|
||||
let remainder = &content[title_start..];
|
||||
// First bookmark title is often the document title
|
||||
if let Some(q1) = remainder.find('"') {
|
||||
let after_q1 = &remainder[q1 + 1..];
|
||||
if let Some(q2) = after_q1.find('"') {
|
||||
let val = &after_q1[..q2];
|
||||
if !val.is_empty() {
|
||||
meta.title = Some(val.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(meta)
|
||||
}
|
||||
|
||||
/// Looks up a `(key "value")` pair in a DjVu annotation S-expression.
///
/// `sexpr` is searched for the first `(key` that is a complete symbol (the
/// key must be followed by whitespace or the opening quote, so `author` does
/// not match `(authors ...)`) and is directly followed by a quoted string.
/// A non-empty value is written to `out`; in every other case (key missing,
/// empty value, unterminated string) `out` is left untouched.
///
/// Inside the value, `\"` and `\\` are unescaped; any other backslash
/// sequence is kept verbatim.
fn extract_djvu_field(sexpr: &str, key: &str, out: &mut Option<String>) {
    let pattern = format!("({key}");
    let mut cursor = 0;

    while let Some(found) = sexpr[cursor..].find(&pattern) {
        // `cursor` always lands right after a match, which is a char boundary.
        cursor += found + pattern.len();
        let tail = &sexpr[cursor..];

        // Skip longer symbols that merely start with `key`.
        if !tail.starts_with(|c: char| c.is_whitespace() || c == '"') {
            continue;
        }

        // The value has to belong to this pair: only whitespace may separate
        // the key from the opening quote.
        let Some(body) = tail.trim_start().strip_prefix('"') else {
            continue;
        };

        if let Some(value) = read_djvu_string(body).filter(|v| !v.is_empty()) {
            *out = Some(value);
        }
        return;
    }
}

/// Reads the contents of a quoted string whose opening quote has already been
/// consumed, stopping at the first unescaped `"`.
///
/// Returns `None` when the closing quote is missing.
fn read_djvu_string(body: &str) -> Option<String> {
    let mut value = String::new();
    let mut chars = body.chars();

    while let Some(c) = chars.next() {
        match c {
            '"' => return Some(value),
            '\\' => match chars.next() {
                Some(escaped @ ('"' | '\\')) => value.push(escaped),
                Some(other) => {
                    value.push('\\');
                    value.push(other);
                }
                None => return None,
            },
            _ => value.push(c),
        }
    }

    None
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue