use std::collections::{HashMap, HashSet}; use std::path::{Path, PathBuf}; use serde::{Deserialize, Serialize}; use tracing::{info, warn}; use crate::error::Result; use crate::hash::compute_file_hash; use crate::media_type::MediaType; use crate::model::{ContentHash, MediaId}; use crate::storage::DynStorageBackend; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct OrphanReport { /// Media items whose files no longer exist on disk. pub orphaned_ids: Vec, /// Files on disk that are not tracked in the database. pub untracked_paths: Vec, /// Files that appear to have moved (same hash, different path). pub moved_files: Vec<(MediaId, PathBuf, PathBuf)>, } #[derive(Debug, Clone, Copy, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum OrphanAction { Delete, Ignore, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct VerificationReport { pub verified: usize, pub mismatched: Vec<(MediaId, String, String)>, pub missing: Vec, pub errors: Vec<(MediaId, String)>, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum IntegrityStatus { Unverified, Verified, Mismatch, Missing, } impl std::fmt::Display for IntegrityStatus { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Unverified => write!(f, "unverified"), Self::Verified => write!(f, "verified"), Self::Mismatch => write!(f, "mismatch"), Self::Missing => write!(f, "missing"), } } } impl std::str::FromStr for IntegrityStatus { type Err = String; fn from_str(s: &str) -> std::result::Result { match s { "unverified" => Ok(Self::Unverified), "verified" => Ok(Self::Verified), "mismatch" => Ok(Self::Mismatch), "missing" => Ok(Self::Missing), _ => Err(format!("unknown integrity status: {s}")), } } } /// Detect orphaned media items (files that no longer exist on disk), /// untracked files (files on disk not in database), and moved files (same hash, different path). pub async fn detect_orphans(storage: &DynStorageBackend) -> Result { let media_paths = storage.list_media_paths().await?; let mut orphaned_ids = Vec::new(); // Build hash index: ContentHash -> Vec<(MediaId, PathBuf)> let mut hash_index: HashMap> = HashMap::new(); for (id, path, hash) in &media_paths { hash_index .entry(hash.clone()) .or_default() .push((*id, path.clone())); } // Detect orphaned files (in DB but not on disk) for (id, path, _hash) in &media_paths { if !path.exists() { orphaned_ids.push(*id); } } // Detect moved files (orphaned items with same hash existing elsewhere) let moved_files = detect_moved_files(&orphaned_ids, &media_paths, &hash_index); // Detect untracked files (on disk but not in DB) let untracked_paths = detect_untracked_files(storage, &media_paths).await?; info!( orphaned = orphaned_ids.len(), untracked = untracked_paths.len(), moved = moved_files.len(), total = media_paths.len(), "orphan detection complete" ); Ok(OrphanReport { orphaned_ids, untracked_paths, moved_files, }) } /// Detect files that appear to have moved (same content hash, different path). fn detect_moved_files( orphaned_ids: &[MediaId], media_paths: &[(MediaId, PathBuf, ContentHash)], hash_index: &HashMap>, ) -> Vec<(MediaId, PathBuf, PathBuf)> { let mut moved = Vec::new(); // Build lookup map for orphaned items: MediaId -> (PathBuf, ContentHash) let orphaned_map: HashMap = media_paths .iter() .filter(|(id, _, _)| orphaned_ids.contains(id)) .map(|(id, path, hash)| (*id, (path.clone(), hash.clone()))) .collect(); // For each orphaned item, check if there's another file with the same hash for (orphaned_id, (old_path, hash)) in &orphaned_map { if let Some(items_with_hash) = hash_index.get(hash) { // Find other items with same hash that exist on disk for (other_id, new_path) in items_with_hash { // Skip if it's the same item if other_id == orphaned_id { continue; } // Check if the new path exists if new_path.exists() { moved.push((*orphaned_id, old_path.clone(), new_path.clone())); // Only report first match (most likely candidate) break; } } } } moved } /// Detect files on disk that are not tracked in the database. async fn detect_untracked_files( storage: &DynStorageBackend, media_paths: &[(MediaId, PathBuf, ContentHash)], ) -> Result> { // Get root directories let roots = storage.list_root_dirs().await?; if roots.is_empty() { return Ok(Vec::new()); } // Build set of tracked paths for fast lookup let tracked_paths: HashSet = media_paths .iter() .map(|(_, path, _)| path.clone()) .collect(); // Get ignore patterns (we'll need to load config somehow, for now use empty) let ignore_patterns: Vec = vec![ ".*".to_string(), "node_modules".to_string(), "__pycache__".to_string(), "target".to_string(), ]; // Walk filesystem for each root in parallel (limit concurrency to 4) let mut filesystem_paths = HashSet::new(); let mut tasks = tokio::task::JoinSet::new(); for root in roots { let ignore_patterns = ignore_patterns.clone(); tasks.spawn_blocking(move || -> Result> { let mut paths = Vec::new(); let walker = walkdir::WalkDir::new(&root) .follow_links(false) .into_iter() .filter_entry(|e| { // Skip directories that match ignore patterns if e.file_type().is_dir() { let name = e.file_name().to_string_lossy(); for pattern in &ignore_patterns { if pattern.starts_with("*.") && let Some(ext) = pattern.strip_prefix("*.") && name.ends_with(ext) { // Extension pattern return false; } else if pattern.contains('*') { // Glob pattern - simplified matching let pattern_without_stars = pattern.replace('*', ""); if name.contains(&pattern_without_stars) { return false; } } else if name.as_ref() == pattern || name.starts_with(&format!("{pattern}.")) { // Exact match or starts with pattern return false; } } } true }); for entry in walker { match entry { Ok(entry) => { let path = entry.path(); // Only process files if !path.is_file() { continue; } // Check if it's a supported media type if MediaType::from_path(path).is_some() { paths.push(path.to_path_buf()); } } Err(e) => { warn!(error = %e, "failed to read directory entry"); } } } Ok(paths) }); } // Collect results from all tasks while let Some(result) = tasks.join_next().await { match result { Ok(Ok(paths)) => { filesystem_paths.extend(paths); } Ok(Err(e)) => { warn!(error = %e, "failed to walk directory"); } Err(e) => { warn!(error = %e, "task join error"); } } } // Compute set difference: filesystem - tracked let untracked: Vec = filesystem_paths .difference(&tracked_paths) .cloned() .collect(); Ok(untracked) } /// Resolve orphaned media items by deleting them from the database. pub async fn resolve_orphans( storage: &DynStorageBackend, action: OrphanAction, ids: &[MediaId], ) -> Result { match action { OrphanAction::Delete => { let count = storage.batch_delete_media(ids).await?; info!(count, "resolved orphans by deletion"); Ok(count) } OrphanAction::Ignore => { info!(count = ids.len(), "orphans ignored"); Ok(0) } } } /// Verify integrity of media files by recomputing hashes and comparing. pub async fn verify_integrity( storage: &DynStorageBackend, media_ids: Option<&[MediaId]>, ) -> Result { let all_paths = storage.list_media_paths().await?; let paths_to_check: Vec<(MediaId, PathBuf, ContentHash)> = if let Some(ids) = media_ids { let id_set: std::collections::HashSet = ids.iter().copied().collect(); all_paths .into_iter() .filter(|(id, _, _)| id_set.contains(id)) .collect() } else { all_paths }; let mut report = VerificationReport { verified: 0, mismatched: Vec::new(), missing: Vec::new(), errors: Vec::new(), }; for (id, path, expected_hash) in paths_to_check { if !path.exists() { report.missing.push(id); continue; } match compute_file_hash(&path).await { Ok(actual_hash) => { if actual_hash.0 == expected_hash.0 { report.verified += 1; } else { report .mismatched .push((id, expected_hash.0.clone(), actual_hash.0)); } } Err(e) => { report.errors.push((id, e.to_string())); } } } info!( verified = report.verified, mismatched = report.mismatched.len(), missing = report.missing.len(), errors = report.errors.len(), "integrity verification complete" ); Ok(report) } /// Clean up orphaned thumbnail files that don't correspond to any media item. pub async fn cleanup_orphaned_thumbnails( storage: &DynStorageBackend, thumbnail_dir: &Path, ) -> Result { let media_paths = storage.list_media_paths().await?; let known_ids: std::collections::HashSet = media_paths .iter() .map(|(id, _, _)| id.0.to_string()) .collect(); let mut removed = 0; if thumbnail_dir.exists() { let entries = std::fs::read_dir(thumbnail_dir)?; for entry in entries.flatten() { let path = entry.path(); if let Some(stem) = path.file_stem().and_then(|s| s.to_str()) && !known_ids.contains(stem) { if let Err(e) = std::fs::remove_file(&path) { warn!(path = %path.display(), error = %e, "failed to remove orphaned thumbnail"); } else { removed += 1; } } } } info!(removed, "orphaned thumbnail cleanup complete"); Ok(removed) }