From 349b51e76cfd6aea687bf12a07f69c87fd0267a8 Mon Sep 17 00:00:00 2001
From: NotAShelf
Date: Sat, 21 Mar 2026 02:14:36 +0300
Subject: [PATCH] pinakes-core: initial subtitle management

Signed-off-by: NotAShelf
Change-Id: Id2f9b87b1cc903462539ab8ea47099696a6a6964
---
 crates/pinakes-core/src/subtitles.rs | 315 ++++++++++++++++++++++++++-
 1 file changed, 313 insertions(+), 2 deletions(-)

diff --git a/crates/pinakes-core/src/subtitles.rs b/crates/pinakes-core/src/subtitles.rs
index 5927899..51ae763 100644
--- a/crates/pinakes-core/src/subtitles.rs
+++ b/crates/pinakes-core/src/subtitles.rs
@@ -1,6 +1,6 @@
 //! Subtitle management for video media items.
 
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
@@ -17,7 +17,7 @@ pub struct Subtitle {
   pub format: SubtitleFormat,
   pub file_path: Option<PathBuf>,
   pub is_embedded: bool,
-  pub track_index: Option,
+  pub track_index: Option,
   pub offset_ms: i64,
   pub created_at: DateTime<Utc>,
 }
@@ -33,6 +33,23 @@ pub enum SubtitleFormat {
   Pgs,
 }
 
+impl SubtitleFormat {
+  /// Returns the MIME type for this subtitle format.
+  pub const fn mime_type(self) -> &'static str {
+    match self {
+      Self::Srt => "application/x-subrip",
+      Self::Vtt => "text/vtt",
+      Self::Ass | Self::Ssa => "text/plain; charset=utf-8",
+      Self::Pgs => "application/octet-stream",
+    }
+  }
+
+  /// Returns true if this format is binary (not UTF-8 text).
+  pub const fn is_binary(self) -> bool {
+    matches!(self, Self::Pgs)
+  }
+}
+
 impl std::fmt::Display for SubtitleFormat {
   fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     let s = match self {
@@ -60,3 +77,297 @@ impl std::str::FromStr for SubtitleFormat {
     }
   }
 }
+
+use crate::error::{PinakesError, Result};
+
+/// Information about a subtitle track embedded in a media container.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct SubtitleTrackInfo {
+  /// Zero-based index among subtitle streams, as reported by ffprobe.
+  pub index: u32,
+  /// Language from the stream's `language` tag, unvalidated, if present.
+  pub language: Option<String>,
+  /// Subtitle format derived from the codec name.
+  pub format: SubtitleFormat,
+  /// Human-readable title from stream tags, if present.
+  pub title: Option<String>,
+}
+
+/// Detects the subtitle format from a file extension.
+///
+/// Returns `None` if the extension is unrecognised or absent.
+pub fn detect_format(path: &Path) -> Option<SubtitleFormat> {
+  match path.extension()?.to_str()?.to_lowercase().as_str() {
+    "srt" => Some(SubtitleFormat::Srt),
+    "vtt" => Some(SubtitleFormat::Vtt),
+    "ass" => Some(SubtitleFormat::Ass),
+    "ssa" => Some(SubtitleFormat::Ssa),
+    "pgs" | "sup" => Some(SubtitleFormat::Pgs),
+    _ => None,
+  }
+}
+
+/// Validates a BCP 47 language code.
+///
+/// Accepts a primary tag of 2-3 letters followed by zero or more
+/// hyphen-separated subtags of 2-8 alphanumeric characters each.
+/// Examples: `en`, `en-US`, `zh-Hant`, `zh-Hant-TW`.
+pub fn validate_language_code(lang: &str) -> bool {
+  static RE: std::sync::LazyLock<regex::Regex> =
+    std::sync::LazyLock::new(|| {
+      #[expect(clippy::expect_used)]
+      regex::Regex::new(r"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$")
+        .expect("valid regex pattern")
+    });
+  RE.is_match(lang)
+}
+
+/// Lists subtitle tracks embedded in a media file using ffprobe.
+///
+/// Returns an empty vec if the file has no subtitle streams.
+///
+/// # Errors
+///
+/// Returns `PinakesError::ExternalTool` if ffprobe cannot be run, exits with
+/// a non-zero status, or produces output that is not valid JSON.
+pub async fn list_embedded_tracks(
+  media_path: &Path,
+) -> Result<Vec<SubtitleTrackInfo>> {
+  let output = tokio::process::Command::new("ffprobe")
+    .args([
+      "-v",
+      "error", // not "quiet": keep diagnostics on stderr for the error path
+      "-print_format",
+      "json",
+      "-show_streams",
+      "-select_streams",
+      "s",
+    ])
+    .arg(media_path)
+    .output()
+    .await
+    .map_err(|e| {
+      PinakesError::ExternalTool {
+        tool: "ffprobe".into(),
+        stderr: e.to_string(),
+      }
+    })?;
+
+  if !output.status.success() {
+    let stderr = String::from_utf8_lossy(&output.stderr).into_owned();
+    return Err(PinakesError::ExternalTool {
+      tool: "ffprobe".into(),
+      stderr,
+    });
+  }
+
+  let json: serde_json::Value = serde_json::from_slice(&output.stdout)
+    .map_err(|e| {
+      PinakesError::ExternalTool {
+        tool: "ffprobe".into(),
+        stderr: format!("failed to parse output: {e}"),
+      }
+    })?;
+
+  let streams = match json.get("streams").and_then(|s| s.as_array()) {
+    Some(s) => s,
+    None => return Ok(vec![]),
+  };
+
+  let mut tracks = Vec::new();
+  for (idx, stream) in streams.iter().enumerate() {
+    let codec_name = stream
+      .get("codec_name")
+      .and_then(|v| v.as_str())
+      .unwrap_or("");
+
+    let format = match codec_name {
+      "subrip" => SubtitleFormat::Srt,
+      "webvtt" => SubtitleFormat::Vtt,
+      "ass" | "ssa" => SubtitleFormat::Ass,
+      "hdmv_pgs_subtitle" | "pgssub" => SubtitleFormat::Pgs,
+      _ => continue, // skip unknown codec
+    };
+
+    let tags = stream.get("tags");
+    let language = tags
+      .and_then(|t| t.get("language"))
+      .and_then(|v| v.as_str())
+      .map(str::to_owned);
+    let title = tags
+      .and_then(|t| t.get("title"))
+      .and_then(|v| v.as_str())
+      .map(str::to_owned);
+
+    tracks.push(SubtitleTrackInfo {
+      index: idx as u32, // position in the subtitle-only list, skips included
+      language,
+      format,
+      title,
+    });
+  }
+
+  Ok(tracks)
+}
+
+/// Extracts an embedded subtitle track from a media file using ffmpeg.
+///
+/// The caller must ensure the output directory exists before calling this
+/// function. The output format is determined by the file extension of
+/// `output_path`.
+///
+/// # Errors
+///
+/// Returns `PinakesError::ExternalTool` if ffmpeg is not available or exits
+/// with a non-zero status.
+pub async fn extract_embedded_track(
+  media_path: &Path,
+  track_index: u32,
+  output_path: &Path,
+) -> Result<()> {
+  let output = tokio::process::Command::new("ffmpeg")
+    .args(["-v", "error", "-i"]) // "error", not "quiet": stderr feeds Err
+    .arg(media_path)
+    .args(["-map", &format!("0:s:{track_index}"), "-y"])
+    .arg(output_path)
+    .output()
+    .await
+    .map_err(|e| {
+      PinakesError::ExternalTool {
+        tool: "ffmpeg".into(),
+        stderr: e.to_string(),
+      }
+    })?;
+
+  if !output.status.success() {
+    let stderr = String::from_utf8_lossy(&output.stderr).into_owned();
+    return Err(PinakesError::ExternalTool {
+      tool: "ffmpeg".into(),
+      stderr,
+    });
+  }
+
+  Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+  use std::path::Path;
+
+  use super::{SubtitleFormat, detect_format, validate_language_code};
+
+  #[test]
+  fn test_detect_format_srt() {
+    assert_eq!(
+      detect_format(Path::new("track.srt")),
+      Some(SubtitleFormat::Srt)
+    );
+  }
+
+  #[test]
+  fn test_detect_format_vtt() {
+    assert_eq!(
+      detect_format(Path::new("track.vtt")),
+      Some(SubtitleFormat::Vtt)
+    );
+  }
+
+  #[test]
+  fn test_detect_format_ass() {
+    assert_eq!(
+      detect_format(Path::new("track.ass")),
+      Some(SubtitleFormat::Ass)
+    );
+  }
+
+  #[test]
+  fn test_detect_format_ssa() {
+    assert_eq!(
+      detect_format(Path::new("track.ssa")),
+      Some(SubtitleFormat::Ssa)
+    );
+  }
+
+  #[test]
+  fn test_detect_format_pgs() {
+    assert_eq!(
+      detect_format(Path::new("track.pgs")),
+      Some(SubtitleFormat::Pgs)
+    );
+  }
+
+  #[test]
+  fn test_detect_format_sup() {
+    assert_eq!(
+      detect_format(Path::new("track.sup")),
+      Some(SubtitleFormat::Pgs)
+    );
+  }
+
+  #[test]
+  fn test_detect_format_unknown() {
+    assert_eq!(detect_format(Path::new("track.xyz")), None);
+  }
+
+  #[test]
+  fn test_detect_format_no_extension() {
+    assert_eq!(detect_format(Path::new("track")), None);
+  }
+
+  #[test]
+  fn test_detect_format_case_insensitive() {
+    assert_eq!(
+      detect_format(Path::new("track.SRT")),
+      Some(SubtitleFormat::Srt)
+    );
+    assert_eq!(
+      detect_format(Path::new("track.VTT")),
+      Some(SubtitleFormat::Vtt)
+    );
+  }
+
+  #[test]
+  fn test_validate_language_code_simple() {
+    assert!(validate_language_code("en"));
+  }
+
+  #[test]
+  fn test_validate_language_code_with_region() {
+    assert!(validate_language_code("en-US"));
+  }
+
+  #[test]
+  fn test_validate_language_code_script() {
+    assert!(validate_language_code("zh-Hant"));
+  }
+
+  #[test]
+  fn test_validate_language_code_full() {
+    assert!(validate_language_code("zh-Hant-TW"));
+  }
+
+  #[test]
+  fn test_validate_language_code_empty() {
+    assert!(!validate_language_code(""));
+  }
+
+  #[test]
+  fn test_validate_language_code_primary_too_long() {
+    assert!(!validate_language_code("toolong-tag-over-3-chars"));
+  }
+
+  #[test]
+  fn test_validate_language_code_underscore_separator() {
+    assert!(!validate_language_code("en_US"));
+  }
+
+  #[test]
+  fn test_validate_language_code_subtag_too_short() {
+    assert!(!validate_language_code("en-a"));
+  }
+
+  #[test]
+  fn test_validate_language_code_three_letter_primary() {
+    assert!(validate_language_code("eng"));
+  }
+}