Signed-off-by: NotAShelf <raf@notashelf.dev> Change-Id: I4c4815ad145650a07f108614034d2e996a6a6964
394 lines
11 KiB
Rust
394 lines
11 KiB
Rust
use std::{
|
|
collections::{HashMap, HashSet},
|
|
path::{Path, PathBuf},
|
|
};
|
|
|
|
use serde::{Deserialize, Serialize};
|
|
use tracing::{info, warn};
|
|
|
|
use crate::{
|
|
error::Result,
|
|
hash::compute_file_hash,
|
|
media_type::MediaType,
|
|
model::{ContentHash, MediaId},
|
|
storage::DynStorageBackend,
|
|
};
|
|
|
|
/// Report of orphaned, untracked, and moved files.
///
/// Produced by [`detect_orphans`]; consumed by callers deciding how to
/// reconcile the database with the filesystem.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OrphanReport {
    /// Media items whose files no longer exist on disk.
    pub orphaned_ids: Vec<MediaId>,
    /// Files on disk that are not tracked in the database.
    pub untracked_paths: Vec<PathBuf>,
    /// Files that appear to have moved (same hash, different path),
    /// as `(orphaned_id, old_path, new_path)` tuples.
    pub moved_files: Vec<(MediaId, PathBuf, PathBuf)>,
}
|
|
|
|
/// Action to take when resolving orphans.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum OrphanAction {
    /// Remove the orphaned items from the database.
    Delete,
    /// Leave the orphaned items untracked; nothing is modified.
    Ignore,
}
|
|
|
|
/// Report of file integrity verification results.
///
/// Produced by [`verify_integrity`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerificationReport {
    /// Number of files whose recomputed hash matched the stored hash.
    pub verified: usize,
    /// Items whose hash differed, as `(id, expected_hash, actual_hash)`.
    pub mismatched: Vec<(MediaId, String, String)>,
    /// Items whose file no longer exists on disk.
    pub missing: Vec<MediaId>,
    /// Items whose file could not be hashed, as `(id, error_message)`.
    pub errors: Vec<(MediaId, String)>,
}
|
|
|
|
/// Status of a media item's file integrity.
///
/// Round-trips through its lowercase string form via `Display` / `FromStr`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum IntegrityStatus {
    /// No verification has been performed yet.
    Unverified,
    /// The recomputed hash matched the stored hash.
    Verified,
    /// The recomputed hash differed from the stored hash.
    Mismatch,
    /// The file was not found on disk.
    Missing,
}
|
|
|
|
impl std::fmt::Display for IntegrityStatus {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
match self {
|
|
Self::Unverified => write!(f, "unverified"),
|
|
Self::Verified => write!(f, "verified"),
|
|
Self::Mismatch => write!(f, "mismatch"),
|
|
Self::Missing => write!(f, "missing"),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl std::str::FromStr for IntegrityStatus {
|
|
type Err = String;
|
|
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
|
|
match s {
|
|
"unverified" => Ok(Self::Unverified),
|
|
"verified" => Ok(Self::Verified),
|
|
"mismatch" => Ok(Self::Mismatch),
|
|
"missing" => Ok(Self::Missing),
|
|
_ => Err(format!("unknown integrity status: {s}")),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Detect orphaned, untracked, and moved files.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `storage` - Storage backend to query
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// Report containing orphaned items, untracked files, and moved files
|
|
pub async fn detect_orphans(
|
|
storage: &DynStorageBackend,
|
|
) -> Result<OrphanReport> {
|
|
let media_paths = storage.list_media_paths().await?;
|
|
let mut orphaned_ids = Vec::new();
|
|
|
|
// Build hash index: ContentHash -> Vec<(MediaId, PathBuf)>
|
|
let mut hash_index: HashMap<ContentHash, Vec<(MediaId, PathBuf)>> =
|
|
HashMap::new();
|
|
for (id, path, hash) in &media_paths {
|
|
hash_index
|
|
.entry(hash.clone())
|
|
.or_default()
|
|
.push((*id, path.clone()));
|
|
}
|
|
|
|
// Detect orphaned files (in DB but not on disk)
|
|
for (id, path, _hash) in &media_paths {
|
|
if !path.exists() {
|
|
orphaned_ids.push(*id);
|
|
}
|
|
}
|
|
|
|
// Detect moved files (orphaned items with same hash existing elsewhere)
|
|
let moved_files =
|
|
detect_moved_files(&orphaned_ids, &media_paths, &hash_index);
|
|
|
|
// Detect untracked files (on disk but not in DB)
|
|
let untracked_paths = detect_untracked_files(storage, &media_paths).await?;
|
|
|
|
info!(
|
|
orphaned = orphaned_ids.len(),
|
|
untracked = untracked_paths.len(),
|
|
moved = moved_files.len(),
|
|
total = media_paths.len(),
|
|
"orphan detection complete"
|
|
);
|
|
|
|
Ok(OrphanReport {
|
|
orphaned_ids,
|
|
untracked_paths,
|
|
moved_files,
|
|
})
|
|
}
|
|
|
|
/// Detect files that appear to have moved (same content hash, different path).
|
|
fn detect_moved_files(
|
|
orphaned_ids: &[MediaId],
|
|
media_paths: &[(MediaId, PathBuf, ContentHash)],
|
|
hash_index: &HashMap<ContentHash, Vec<(MediaId, PathBuf)>>,
|
|
) -> Vec<(MediaId, PathBuf, PathBuf)> {
|
|
let mut moved = Vec::new();
|
|
|
|
// Build lookup map for orphaned items: MediaId -> (PathBuf, ContentHash)
|
|
let orphaned_map: HashMap<MediaId, (PathBuf, ContentHash)> = media_paths
|
|
.iter()
|
|
.filter(|(id, ..)| orphaned_ids.contains(id))
|
|
.map(|(id, path, hash)| (*id, (path.clone(), hash.clone())))
|
|
.collect();
|
|
|
|
// For each orphaned item, check if there's another file with the same hash
|
|
for (orphaned_id, (old_path, hash)) in &orphaned_map {
|
|
if let Some(items_with_hash) = hash_index.get(hash) {
|
|
// Find other items with same hash that exist on disk
|
|
for (other_id, new_path) in items_with_hash {
|
|
// Skip if it's the same item
|
|
if other_id == orphaned_id {
|
|
continue;
|
|
}
|
|
|
|
// Check if the new path exists
|
|
if new_path.exists() {
|
|
moved.push((*orphaned_id, old_path.clone(), new_path.clone()));
|
|
// Only report first match (most likely candidate)
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
moved
|
|
}
|
|
|
|
/// Detect files on disk that are not tracked in the database.
///
/// Walks every storage root on blocking threads, collects all files whose
/// extension maps to a supported [`MediaType`], and returns those not
/// present in `media_paths`. Directories matching the built-in ignore
/// patterns are pruned from the walk; unreadable entries and failed walks
/// are logged and skipped rather than aborting the scan.
///
/// # Returns
///
/// Paths of supported media files that have no database entry.
async fn detect_untracked_files(
    storage: &DynStorageBackend,
    media_paths: &[(MediaId, PathBuf, ContentHash)],
) -> Result<Vec<PathBuf>> {
    // No roots configured -> nothing on disk can be untracked.
    let roots = storage.list_root_dirs().await?;
    if roots.is_empty() {
        return Ok(Vec::new());
    }

    // Build set of tracked paths for fast lookup
    let tracked_paths: HashSet<PathBuf> = media_paths
        .iter()
        .map(|(_, path, _)| path.clone())
        .collect();

    // Ignore patterns are currently hard-coded defaults.
    // TODO(review): load these from configuration instead.
    let ignore_patterns: Vec<String> = vec![
        ".*".to_string(),
        "node_modules".to_string(),
        "__pycache__".to_string(),
        "target".to_string(),
    ];

    // Walk each root on its own blocking thread. NOTE(review): one task is
    // spawned per root with no concurrency cap — confirm whether a limit is
    // needed for installs with many roots.
    let mut filesystem_paths = HashSet::new();
    let mut tasks = tokio::task::JoinSet::new();

    for root in roots {
        // Each task needs its own copy; the patterns move into the closure.
        let ignore_patterns = ignore_patterns.clone();
        tasks.spawn_blocking(move || -> Result<Vec<PathBuf>> {
            let mut paths = Vec::new();

            let walker = walkdir::WalkDir::new(&root)
                .follow_links(false)
                .into_iter()
                .filter_entry(|e| {
                    // Skip directories that match ignore patterns; returning
                    // false here prunes the whole subtree from the walk.
                    if e.file_type().is_dir() {
                        let name = e.file_name().to_string_lossy();
                        for pattern in &ignore_patterns {
                            if pattern.starts_with("*.")
                                && let Some(ext) = pattern.strip_prefix("*.")
                                && name.ends_with(ext)
                            {
                                // Extension pattern
                                return false;
                            } else if pattern.contains('*') {
                                // Glob pattern - simplified matching.
                                // NOTE(review): for the `.*` pattern this
                                // strips to `.` and skips any directory whose
                                // name contains a dot — broader than
                                // hidden-directory matching; confirm intended.
                                let pattern_without_stars = pattern.replace('*', "");
                                if name.contains(&pattern_without_stars) {
                                    return false;
                                }
                            } else if name.as_ref() == pattern
                                || name.starts_with(&format!("{pattern}."))
                            {
                                // Exact match or starts with pattern
                                return false;
                            }
                        }
                    }
                    true
                });

            for entry in walker {
                match entry {
                    Ok(entry) => {
                        let path = entry.path();

                        // Only process files
                        if !path.is_file() {
                            continue;
                        }

                        // Check if it's a supported media type
                        if MediaType::from_path(path).is_some() {
                            paths.push(path.to_path_buf());
                        }
                    },
                    Err(e) => {
                        // Best-effort scan: log and keep walking.
                        warn!(error = %e, "failed to read directory entry");
                    },
                }
            }

            Ok(paths)
        });
    }

    // Collect results from all tasks; failures of individual walks are
    // logged and do not fail the overall detection.
    while let Some(result) = tasks.join_next().await {
        match result {
            Ok(Ok(paths)) => {
                filesystem_paths.extend(paths);
            },
            Ok(Err(e)) => {
                warn!(error = %e, "failed to walk directory");
            },
            Err(e) => {
                warn!(error = %e, "task join error");
            },
        }
    }

    // Compute set difference: filesystem - tracked
    let untracked: Vec<PathBuf> = filesystem_paths
        .difference(&tracked_paths)
        .cloned()
        .collect();

    Ok(untracked)
}
|
|
|
|
/// Resolve orphaned media items by deleting them from the database.
|
|
pub async fn resolve_orphans(
|
|
storage: &DynStorageBackend,
|
|
action: OrphanAction,
|
|
ids: &[MediaId],
|
|
) -> Result<u64> {
|
|
match action {
|
|
OrphanAction::Delete => {
|
|
let count = storage.batch_delete_media(ids).await?;
|
|
info!(count, "resolved orphans by deletion");
|
|
Ok(count)
|
|
},
|
|
OrphanAction::Ignore => {
|
|
info!(count = ids.len(), "orphans ignored");
|
|
Ok(0)
|
|
},
|
|
}
|
|
}
|
|
|
|
/// Verify integrity of media files by recomputing hashes and comparing.
|
|
pub async fn verify_integrity(
|
|
storage: &DynStorageBackend,
|
|
media_ids: Option<&[MediaId]>,
|
|
) -> Result<VerificationReport> {
|
|
let all_paths = storage.list_media_paths().await?;
|
|
|
|
let paths_to_check: Vec<(MediaId, PathBuf, ContentHash)> =
|
|
if let Some(ids) = media_ids {
|
|
let id_set: std::collections::HashSet<MediaId> =
|
|
ids.iter().copied().collect();
|
|
all_paths
|
|
.into_iter()
|
|
.filter(|(id, ..)| id_set.contains(id))
|
|
.collect()
|
|
} else {
|
|
all_paths
|
|
};
|
|
|
|
let mut report = VerificationReport {
|
|
verified: 0,
|
|
mismatched: Vec::new(),
|
|
missing: Vec::new(),
|
|
errors: Vec::new(),
|
|
};
|
|
|
|
for (id, path, expected_hash) in paths_to_check {
|
|
if !path.exists() {
|
|
report.missing.push(id);
|
|
continue;
|
|
}
|
|
|
|
match compute_file_hash(&path).await {
|
|
Ok(actual_hash) => {
|
|
if actual_hash.0 == expected_hash.0 {
|
|
report.verified += 1;
|
|
} else {
|
|
report
|
|
.mismatched
|
|
.push((id, expected_hash.0.clone(), actual_hash.0));
|
|
}
|
|
},
|
|
Err(e) => {
|
|
report.errors.push((id, e.to_string()));
|
|
},
|
|
}
|
|
}
|
|
|
|
info!(
|
|
verified = report.verified,
|
|
mismatched = report.mismatched.len(),
|
|
missing = report.missing.len(),
|
|
errors = report.errors.len(),
|
|
"integrity verification complete"
|
|
);
|
|
|
|
Ok(report)
|
|
}
|
|
|
|
/// Clean up orphaned thumbnail files that don't correspond to any media item.
|
|
pub async fn cleanup_orphaned_thumbnails(
|
|
storage: &DynStorageBackend,
|
|
thumbnail_dir: &Path,
|
|
) -> Result<usize> {
|
|
let media_paths = storage.list_media_paths().await?;
|
|
let known_ids: std::collections::HashSet<String> = media_paths
|
|
.iter()
|
|
.map(|(id, ..)| id.0.to_string())
|
|
.collect();
|
|
|
|
let mut removed = 0;
|
|
|
|
if thumbnail_dir.exists() {
|
|
let entries = std::fs::read_dir(thumbnail_dir)?;
|
|
for entry in entries.flatten() {
|
|
let path = entry.path();
|
|
if let Some(stem) = path.file_stem().and_then(|s| s.to_str())
|
|
&& !known_ids.contains(stem)
|
|
{
|
|
if let Err(e) = std::fs::remove_file(&path) {
|
|
warn!(path = %path.display(), error = %e, "failed to remove orphaned thumbnail");
|
|
} else {
|
|
removed += 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
info!(removed, "orphaned thumbnail cleanup complete");
|
|
Ok(removed)
|
|
}
|