initial commit

Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: I4a6b498153eccd5407510dd541b7f4816a6a6964
This commit is contained in:
raf 2026-01-30 22:05:46 +03:00
commit 6a73d11c4b
Signed by: NotAShelf
GPG key ID: 29D95B64378DB4BF
124 changed files with 34856 additions and 0 deletions

View file

@ -0,0 +1,192 @@
use std::path::Path;
use crate::error::{PinakesError, Result};
use crate::media_type::MediaType;
use super::{ExtractedMetadata, MetadataExtractor};
/// Metadata extractor for document formats: PDF, EPUB and DjVu.
pub struct DocumentExtractor;

impl MetadataExtractor for DocumentExtractor {
    /// Extracts metadata from the file at `path`, choosing the parser from
    /// the media type reported by `MediaType::from_path`.
    ///
    /// Paths with no recognised media type, or with a type this extractor
    /// does not handle, produce default (empty) metadata instead of an error.
    fn extract(&self, path: &Path) -> Result<ExtractedMetadata> {
        let Some(kind) = MediaType::from_path(path) else {
            return Ok(ExtractedMetadata::default());
        };
        match kind {
            MediaType::Pdf => extract_pdf(path),
            MediaType::Epub => extract_epub(path),
            MediaType::Djvu => extract_djvu(path),
            _ => Ok(ExtractedMetadata::default()),
        }
    }

    /// Lists the media types that `extract` has a dedicated parser for.
    fn supported_types(&self) -> &[MediaType] {
        &[MediaType::Pdf, MediaType::Epub, MediaType::Djvu]
    }
}
/// Extracts metadata from the PDF file at `path`.
///
/// Reads the document information dictionary referenced by the trailer's
/// `Info` entry and maps `Title`, `Author` and `Subject` to the title, artist
/// and description fields. `Creator`, `Producer` and the page count are stored
/// in `extra` under `"creator"`, `"producer"` and `"page_count"`.
///
/// Entries that are missing, empty or not string-like are skipped rather than
/// recorded as empty strings.
///
/// # Errors
///
/// Returns `PinakesError::MetadataExtraction` when the file cannot be loaded
/// as a PDF.
fn extract_pdf(path: &Path) -> Result<ExtractedMetadata> {
    let doc = lopdf::Document::load(path)
        .map_err(|e| PinakesError::MetadataExtraction(format!("PDF load: {e}")))?;
    let mut meta = ExtractedMetadata::default();

    // The trailer's Info entry is usually an indirect reference, but a
    // directly embedded dictionary is accepted as well.
    let info_dict = doc
        .trailer
        .get(b"Info")
        .ok()
        .map(|obj| resolve_pdf_object(&doc, obj))
        .and_then(|obj| obj.as_dict().ok());

    if let Some(dict) = info_dict {
        // Individual values may themselves be indirect references, so each
        // one is resolved before being converted to text.
        let text = |key: &[u8]| -> Option<String> {
            dict.get(key)
                .ok()
                .and_then(|obj| pdf_object_to_string(resolve_pdf_object(&doc, obj)))
        };

        meta.title = text(b"Title");
        meta.artist = text(b"Author");
        meta.description = text(b"Subject");

        let extras: [(&[u8], &str); 2] = [(b"Creator", "creator"), (b"Producer", "producer")];
        for (pdf_key, extra_key) in extras {
            // Only record values that actually carry text; an empty entry
            // would be indistinguishable from noise for consumers of `extra`.
            if let Some(value) = text(pdf_key).filter(|v| !v.is_empty()) {
                meta.extra.insert(extra_key.to_string(), value);
            }
        }
    }

    // Page count
    let page_count = doc.get_pages().len();
    if page_count > 0 {
        meta.extra
            .insert("page_count".to_string(), page_count.to_string());
    }
    Ok(meta)
}

/// Follows `obj` one level if it is an indirect reference that resolves in
/// `doc`; otherwise returns `obj` unchanged.
fn resolve_pdf_object<'a>(doc: &'a lopdf::Document, obj: &'a lopdf::Object) -> &'a lopdf::Object {
    obj.as_reference()
        .ok()
        .and_then(|id| doc.get_object(id).ok())
        .unwrap_or(obj)
}
/// Converts a PDF string or name object into a Rust `String`.
///
/// Returns `None` for every other object kind. String objects are decoded
/// with [`decode_pdf_text`], which honours the byte-order marks that PDF text
/// strings may carry; names are decoded as lossy UTF-8.
fn pdf_object_to_string(obj: &lopdf::Object) -> Option<String> {
    match obj {
        lopdf::Object::String(bytes, _) => Some(decode_pdf_text(bytes)),
        lopdf::Object::Name(name) => Some(String::from_utf8_lossy(name).into_owned()),
        _ => None,
    }
}

/// Decodes the raw bytes of a PDF text string.
///
/// * A leading `FE FF` marks UTF-16BE: the remainder is decoded as big-endian
///   UTF-16 (invalid units become U+FFFD, a trailing odd byte is ignored).
/// * A leading `EF BB BF` marks UTF-8: the marker is stripped.
/// * Anything else is decoded as lossy UTF-8, as before.
fn decode_pdf_text(bytes: &[u8]) -> String {
    if let Some(utf16) = bytes.strip_prefix(&[0xFE, 0xFF]) {
        let units: Vec<u16> = utf16
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        return String::from_utf16_lossy(&units);
    }
    let utf8 = bytes.strip_prefix(&[0xEF, 0xBB, 0xBF]).unwrap_or(bytes);
    String::from_utf8_lossy(utf8).into_owned()
}
/// Extracts metadata from the EPUB file at `path`.
///
/// The `title`, `creator` and `description` metadata entries populate the
/// title, artist and description fields; `language`, `publisher` and `date`
/// are copied into `extra` under the same names when present.
///
/// # Errors
///
/// Returns `PinakesError::MetadataExtraction` when the file cannot be opened
/// or parsed as an EPUB.
fn extract_epub(path: &Path) -> Result<ExtractedMetadata> {
    let book = epub::doc::EpubDoc::new(path)
        .map_err(|e| PinakesError::MetadataExtraction(format!("EPUB parse: {e}")))?;

    // Owned copy of a metadata entry's value, if the entry exists.
    let field = |name: &str| book.mdata(name).map(|item| item.value.clone());

    let mut meta = ExtractedMetadata {
        title: field("title"),
        artist: field("creator"),
        description: field("description"),
        ..Default::default()
    };

    for key in ["language", "publisher", "date"] {
        if let Some(value) = field(key) {
            meta.extra.insert(key.to_string(), value);
        }
    }
    Ok(meta)
}
/// Extracts metadata from the DjVu file at `path` by scanning its raw bytes.
///
/// The whole file is read and viewed as lossy UTF-8 text. If a `(metadata`
/// S-expression is found, the `title`, `author`, `subject` (falling back to
/// `description`), `year` and `creator` fields are pulled out of the text that
/// follows it. When no title was found, the first quoted string after a
/// `(bookmarks` marker is used as the title.
///
/// Files shorter than 16 bytes yield default metadata. Only the length is
/// checked; the file's magic bytes are not validated here.
///
/// # Errors
///
/// Returns `PinakesError::MetadataExtraction` when the file cannot be read.
fn extract_djvu(path: &Path) -> Result<ExtractedMetadata> {
    let raw = std::fs::read(path)
        .map_err(|e| PinakesError::MetadataExtraction(format!("DjVu read: {e}")))?;
    let mut meta = ExtractedMetadata::default();

    // Too short to hold a DjVu header plus any annotation data.
    if raw.len() < 16 {
        return Ok(meta);
    }

    // Annotation text is searched for directly in the file contents.
    let text = String::from_utf8_lossy(&raw);

    if let Some(tail) = text.find("(metadata").map(|at| &text[at..]) {
        // Looks up one `(key "value")` pair in the metadata block.
        let field = |key: &str| {
            let mut slot = None;
            extract_djvu_field(tail, key, &mut slot);
            slot
        };

        if let Some(title) = field("title") {
            meta.title = Some(title);
        }
        if let Some(author) = field("author") {
            meta.artist = Some(author);
        }
        // "subject" wins over "description" when both are present.
        meta.description = field("subject").or_else(|| field("description"));
        if let Some(year) = field("year") {
            meta.year = year.parse().ok();
        }
        if let Some(creator) = field("creator") {
            meta.extra.insert("creator".to_string(), creator);
        }
    }

    // Fallback: some encoders only provide bookmarks, and the first bookmark
    // title is often the document title.
    if meta.title.is_none()
        && let Some(at) = text.find("(bookmarks")
    {
        let first_quoted = text[at..]
            .split_once('"')
            .and_then(|(_, rest)| rest.split_once('"'))
            .map(|(value, _)| value)
            .filter(|value| !value.is_empty());
        if let Some(value) = first_quoted {
            meta.title = Some(value.to_string());
        }
    }
    Ok(meta)
}
/// Looks for a `(key "value")` pair in `sexpr` and stores the value in `out`.
///
/// The first occurrence of `(key` that is a complete key (followed by
/// whitespace or directly by the opening quote) and that is immediately
/// followed by a quoted string is used. This means:
///
/// * longer keys sharing the prefix (e.g. `(titlepage` when looking for
///   `title`) are skipped,
/// * a field without a value of its own never picks up the quoted string of a
///   later field,
/// * backslash escapes inside the value (`\"`, `\\`, `\n`, octal `\303`, ...)
///   are decoded instead of terminating the value early.
///
/// `out` is left untouched when no such pair exists or when the value is empty.
fn extract_djvu_field(sexpr: &str, key: &str, out: &mut Option<String>) {
    let pattern = format!("({key}");
    for (start, _) in sexpr.match_indices(pattern.as_str()) {
        let after_key = &sexpr[start + pattern.len()..];
        let rest = after_key.trim_start();

        // The key must end right here: either whitespace was skipped or the
        // opening quote follows directly. Otherwise this is a longer key.
        let key_ends_here = rest.len() < after_key.len() || rest.starts_with('"');
        if !key_ends_here {
            continue;
        }
        // The value has to be the very next token, not some later string.
        let Some(body) = rest.strip_prefix('"') else {
            continue;
        };
        if let Some(value) = read_djvu_quoted(body) {
            if !value.is_empty() {
                *out = Some(value);
            }
            return;
        }
    }
}

/// Reads a quoted string body (the text just after the opening `"`), up to
/// the first unescaped closing quote, decoding C-style backslash escapes.
///
/// Returns `None` if the closing quote is missing.
fn read_djvu_quoted(body: &str) -> Option<String> {
    let mut decoded: Vec<u8> = Vec::new();
    let mut bytes = body.bytes().peekable();
    while let Some(byte) = bytes.next() {
        match byte {
            b'"' => return Some(String::from_utf8_lossy(&decoded).into_owned()),
            b'\\' => match bytes.next()? {
                b'n' => decoded.push(b'\n'),
                b't' => decoded.push(b'\t'),
                b'r' => decoded.push(b'\r'),
                b'a' => decoded.push(0x07),
                b'b' => decoded.push(0x08),
                b'f' => decoded.push(0x0C),
                b'v' => decoded.push(0x0B),
                first @ b'0'..=b'7' => {
                    // Up to three octal digits encode a single raw byte.
                    let mut value = u32::from(first - b'0');
                    for _ in 0..2 {
                        match bytes.peek().copied() {
                            Some(digit @ b'0'..=b'7') => {
                                value = value * 8 + u32::from(digit - b'0');
                                bytes.next();
                            }
                            _ => break,
                        }
                    }
                    // Values above 0o377 do not fit a byte; keep the low 8 bits.
                    decoded.push((value & 0xFF) as u8);
                }
                // `\"`, `\\` and unknown escapes stand for the character itself.
                other => decoded.push(other),
            },
            other => decoded.push(other),
        }
    }
    None
}