pinakes/crates/pinakes-core/src/subtitles.rs
NotAShelf 349b51e76c
pinakes-core: initial subtitle management
Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: Id2f9b87b1cc903462539ab8ea47099696a6a6964
2026-03-22 17:58:36 +03:00

373 lines
9 KiB
Rust

//! Subtitle management for video media items.
use std::path::{Path, PathBuf};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use uuid::Uuid;
use crate::model::MediaId;
/// A subtitle track associated with a media item.
///
/// The type derives `Serialize`/`Deserialize`, so field names are part of the
/// serialized representation.
///
/// NOTE(review): nothing in this file constructs or reads a `Subtitle`; the
/// per-field notes below are inferred from names and types and should be
/// confirmed against the storage/API layers that use them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Subtitle {
/// Identifier of this subtitle record.
pub id: Uuid,
/// The media item this subtitle is associated with.
pub media_id: MediaId,
/// Language of the track, if known. Presumably a code accepted by
/// `validate_language_code` — TODO confirm where validation happens.
pub language: Option<String>,
/// Subtitle format of the track.
pub format: SubtitleFormat,
/// Path to a subtitle file, if any. Presumably `None` for tracks that only
/// exist inside the media container — TODO confirm.
pub file_path: Option<PathBuf>,
/// Presumably `true` when the track lives inside the media container rather
/// than in a separate file — TODO confirm.
pub is_embedded: bool,
/// Track index, if any. Presumably the subtitle-stream index of an embedded
/// track (the numbering `extract_embedded_track` takes) — TODO confirm.
pub track_index: Option<u32>,
/// Signed offset in milliseconds. Presumably a timing shift applied to the
/// track; the sign convention is not visible here — TODO confirm.
pub offset_ms: i64,
/// UTC timestamp; presumably when this record was created.
pub created_at: DateTime<Utc>,
}
/// Supported subtitle formats.
///
/// Variants are serialized by serde under their lowercase names
/// (`rename_all = "lowercase"`), the same spellings produced by the `Display`
/// impl and accepted by the `FromStr` impl.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum SubtitleFormat {
/// SubRip subtitles (`.srt`).
Srt,
/// WebVTT subtitles (`.vtt`).
Vtt,
/// ASS subtitles (`.ass`).
Ass,
/// SSA subtitles (`.ssa`).
Ssa,
/// PGS subtitles (`.pgs` / `.sup`); the only format `is_binary` reports as
/// binary.
Pgs,
}
impl SubtitleFormat {
    /// Returns the MIME type string for this subtitle format.
    ///
    /// ASS and SSA share `text/plain; charset=utf-8`; PGS maps to
    /// `application/octet-stream`.
    pub const fn mime_type(self) -> &'static str {
        match self {
            Self::Pgs => "application/octet-stream",
            Self::Ssa | Self::Ass => "text/plain; charset=utf-8",
            Self::Vtt => "text/vtt",
            Self::Srt => "application/x-subrip",
        }
    }

    /// Reports whether this format is binary rather than UTF-8 text.
    ///
    /// Only PGS is binary. The match is exhaustive on purpose so that adding
    /// a variant forces a decision here.
    pub const fn is_binary(self) -> bool {
        match self {
            Self::Pgs => true,
            Self::Srt | Self::Vtt | Self::Ass | Self::Ssa => false,
        }
    }
}
/// Renders the format as its lowercase short name (`srt`, `vtt`, `ass`,
/// `ssa`, `pgs`) — the same spellings the `FromStr` impl accepts.
impl std::fmt::Display for SubtitleFormat {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Written verbatim: width/padding flags on the outer formatter are
        // deliberately not applied to the name.
        f.write_str(match self {
            Self::Srt => "srt",
            Self::Vtt => "vtt",
            Self::Ass => "ass",
            Self::Ssa => "ssa",
            Self::Pgs => "pgs",
        })
    }
}
impl std::str::FromStr for SubtitleFormat {
type Err = String;
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
match s {
"srt" => Ok(Self::Srt),
"vtt" => Ok(Self::Vtt),
"ass" => Ok(Self::Ass),
"ssa" => Ok(Self::Ssa),
"pgs" => Ok(Self::Pgs),
_ => Err(format!("unknown subtitle format: {s}")),
}
}
}
use crate::error::{PinakesError, Result};
/// Information about a subtitle track embedded in a media container.
///
/// Values of this type are produced by `list_embedded_tracks` from ffprobe's
/// JSON output; streams whose codec is not recognised are never represented.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SubtitleTrackInfo {
/// Zero-based position of the stream within ffprobe's subtitle-only stream
/// listing (`-select_streams s`). Skipped (unknown-codec) streams still
/// consume an index, so this matches the `0:s:N` numbering that
/// `extract_embedded_track` passes to ffmpeg.
pub index: u32,
/// Language tag taken verbatim from the stream's `tags.language`, if
/// present. It is not validated or normalised here, so it may not be a
/// well-formed BCP 47 code; see `validate_language_code`.
pub language: Option<String>,
/// Subtitle format derived from the stream's `codec_name`.
pub format: SubtitleFormat,
/// Human-readable title taken verbatim from the stream's `tags.title`, if
/// present.
pub title: Option<String>,
}
/// Guesses the subtitle format of `path` from its file extension.
///
/// The comparison ignores case. Both `.pgs` and `.sup` map to
/// [`SubtitleFormat::Pgs`].
///
/// Returns `None` when the path has no extension, the extension is not valid
/// UTF-8, or it is not one of the recognised subtitle extensions.
pub fn detect_format(path: &Path) -> Option<SubtitleFormat> {
    let extension = path.extension().and_then(std::ffi::OsStr::to_str)?;

    let format = match extension.to_lowercase().as_str() {
        "srt" => SubtitleFormat::Srt,
        "vtt" => SubtitleFormat::Vtt,
        "ass" => SubtitleFormat::Ass,
        "ssa" => SubtitleFormat::Ssa,
        "pgs" | "sup" => SubtitleFormat::Pgs,
        _ => return None,
    };
    Some(format)
}
/// Validates a BCP 47 language code.
///
/// Accepts a primary tag of 2-3 ASCII letters followed by zero or more
/// hyphen-separated subtags of 2-8 ASCII alphanumeric characters each.
/// Examples: `en`, `eng`, `en-US`, `zh-Hant`, `zh-Hant-TW`.
///
/// Rejected: the empty string, `_` as a separator (`en_US`), single-character
/// subtags (`en-a`), empty subtags (`en-`, `en--US`), and anything containing
/// non-ASCII characters or whitespace.
///
/// This is a shape check only; it does not verify that the subtags are
/// registered language, script, or region codes.
pub fn validate_language_code(lang: &str) -> bool {
    // Implemented with plain string operations: the grammar is small enough
    // that a regex (with its lazy compilation and `expect`) is not needed.
    //
    // Byte lengths equal character counts here because every byte of an
    // accepted subtag must be ASCII.
    let mut subtags = lang.split('-');

    // `split` always yields at least one piece (possibly empty), which is the
    // primary language subtag.
    let primary_ok = subtags.next().is_some_and(|primary| {
        (2..=3).contains(&primary.len())
            && primary.bytes().all(|b| b.is_ascii_alphabetic())
    });

    primary_ok
        && subtags.all(|subtag| {
            (2..=8).contains(&subtag.len())
                && subtag.bytes().all(|b| b.is_ascii_alphanumeric())
        })
}
/// Lists subtitle tracks embedded in a media file using ffprobe.
///
/// Runs `ffprobe -show_streams -select_streams s` with JSON output and maps
/// each subtitle stream with a recognised codec to a [`SubtitleTrackInfo`].
/// Streams with an unrecognised or missing `codec_name` are skipped, but they
/// still consume an index, so `SubtitleTrackInfo::index` always matches the
/// `0:s:N` numbering expected by `extract_embedded_track`.
///
/// Both the `ass` and `ssa` codec names are reported as
/// [`SubtitleFormat::Ass`]. Language and title are copied verbatim from the
/// stream tags and are not validated.
///
/// Returns an empty vec if the file has no subtitle streams (or the JSON has
/// no `streams` array).
///
/// # Errors
///
/// Returns `PinakesError::ExternalTool` if ffprobe cannot be spawned, exits
/// with a non-zero status (the error carries ffprobe's stderr), or produces
/// output that is not valid JSON.
pub async fn list_embedded_tracks(
    media_path: &Path,
) -> Result<Vec<SubtitleTrackInfo>> {
    // Every failure in this function is reported against the same tool.
    let tool_error = |stderr: String| PinakesError::ExternalTool {
        tool: "ffprobe".into(),
        stderr,
    };

    let output = tokio::process::Command::new("ffprobe")
        .args([
            // `error` rather than `quiet`: `quiet` silences ffprobe completely,
            // which would leave the reported stderr empty on failure.
            "-v",
            "error",
            "-print_format",
            "json",
            "-show_streams",
            "-select_streams",
            "s",
        ])
        .arg(media_path)
        .output()
        .await
        .map_err(|e| tool_error(e.to_string()))?;

    if !output.status.success() {
        return Err(tool_error(
            String::from_utf8_lossy(&output.stderr).into_owned(),
        ));
    }

    let json: serde_json::Value = serde_json::from_slice(&output.stdout)
        .map_err(|e| tool_error(format!("failed to parse output: {e}")))?;

    let Some(streams) =
        json.get("streams").and_then(serde_json::Value::as_array)
    else {
        return Ok(Vec::new());
    };

    // Number the streams before filtering so that skipped streams keep their
    // slot in the subtitle-stream ordinal.
    let tracks = (0u32..)
        .zip(streams)
        .filter_map(|(index, stream)| {
            let codec_name = stream
                .get("codec_name")
                .and_then(serde_json::Value::as_str)
                .unwrap_or("");
            let format = match codec_name {
                "subrip" => SubtitleFormat::Srt,
                "webvtt" => SubtitleFormat::Vtt,
                "ass" | "ssa" => SubtitleFormat::Ass,
                "hdmv_pgs_subtitle" | "pgssub" => SubtitleFormat::Pgs,
                // Unknown codec: not something we can serve or extract.
                _ => return None,
            };

            // Reads an optional string entry from the stream's `tags` object.
            let tag = |key: &str| {
                stream
                    .get("tags")
                    .and_then(|tags| tags.get(key))
                    .and_then(serde_json::Value::as_str)
                    .map(str::to_owned)
            };

            Some(SubtitleTrackInfo {
                index,
                language: tag("language"),
                format,
                title: tag("title"),
            })
        })
        .collect();

    Ok(tracks)
}
/// Extracts an embedded subtitle track from a media file using ffmpeg.
///
/// `track_index` is the zero-based index among the file's subtitle streams
/// (ffmpeg's `0:s:N` selector), i.e. the `index` reported by
/// `list_embedded_tracks`.
///
/// The caller must ensure the output directory exists before calling this
/// function. The output format is determined by the file extension of
/// `output_path`. An existing file at `output_path` is overwritten (`-y`).
///
/// # Errors
///
/// Returns `PinakesError::ExternalTool` if ffmpeg cannot be spawned or exits
/// with a non-zero status; in the latter case the error carries ffmpeg's
/// stderr output.
pub async fn extract_embedded_track(
    media_path: &Path,
    track_index: u32,
    output_path: &Path,
) -> Result<()> {
    // Every failure in this function is reported against the same tool.
    let tool_error = |stderr: String| PinakesError::ExternalTool {
        tool: "ffmpeg".into(),
        stderr,
    };

    let stream_selector = format!("0:s:{track_index}");

    let output = tokio::process::Command::new("ffmpeg")
        // `error` rather than `quiet`: `quiet` silences ffmpeg completely,
        // which would leave the reported stderr empty on failure.
        .args(["-v", "error", "-i"])
        .arg(media_path)
        .args(["-map", stream_selector.as_str(), "-y"])
        .arg(output_path)
        .output()
        .await
        .map_err(|e| tool_error(e.to_string()))?;

    if output.status.success() {
        Ok(())
    } else {
        Err(tool_error(
            String::from_utf8_lossy(&output.stderr).into_owned(),
        ))
    }
}
#[cfg(test)]
mod tests {
    use std::path::Path;

    use super::{SubtitleFormat, detect_format, validate_language_code};

    /// Runs `detect_format` on a bare file name.
    fn detect(file_name: &str) -> Option<SubtitleFormat> {
        detect_format(Path::new(file_name))
    }

    // --- detect_format -----------------------------------------------------

    #[test]
    fn test_detect_format_srt() {
        assert_eq!(detect("track.srt"), Some(SubtitleFormat::Srt));
    }

    #[test]
    fn test_detect_format_vtt() {
        assert_eq!(detect("track.vtt"), Some(SubtitleFormat::Vtt));
    }

    #[test]
    fn test_detect_format_ass() {
        assert_eq!(detect("track.ass"), Some(SubtitleFormat::Ass));
    }

    #[test]
    fn test_detect_format_ssa() {
        assert_eq!(detect("track.ssa"), Some(SubtitleFormat::Ssa));
    }

    #[test]
    fn test_detect_format_pgs() {
        assert_eq!(detect("track.pgs"), Some(SubtitleFormat::Pgs));
    }

    #[test]
    fn test_detect_format_sup() {
        // `.sup` is treated as PGS as well.
        assert_eq!(detect("track.sup"), Some(SubtitleFormat::Pgs));
    }

    #[test]
    fn test_detect_format_unknown() {
        assert_eq!(detect("track.xyz"), None);
    }

    #[test]
    fn test_detect_format_no_extension() {
        assert_eq!(detect("track"), None);
    }

    #[test]
    fn test_detect_format_case_insensitive() {
        assert_eq!(detect("track.SRT"), Some(SubtitleFormat::Srt));
        assert_eq!(detect("track.VTT"), Some(SubtitleFormat::Vtt));
    }

    // --- validate_language_code --------------------------------------------

    #[test]
    fn test_validate_language_code_simple() {
        assert!(
            validate_language_code("en"),
            "two-letter primary tag must be accepted"
        );
    }

    #[test]
    fn test_validate_language_code_with_region() {
        assert!(
            validate_language_code("en-US"),
            "primary tag plus region must be accepted"
        );
    }

    #[test]
    fn test_validate_language_code_script() {
        assert!(
            validate_language_code("zh-Hant"),
            "primary tag plus script must be accepted"
        );
    }

    #[test]
    fn test_validate_language_code_full() {
        assert!(
            validate_language_code("zh-Hant-TW"),
            "primary tag plus script and region must be accepted"
        );
    }

    #[test]
    fn test_validate_language_code_empty() {
        assert!(
            !validate_language_code(""),
            "empty string must be rejected"
        );
    }

    #[test]
    fn test_validate_language_code_primary_too_long() {
        assert!(
            !validate_language_code("toolong-tag-over-3-chars"),
            "primary tag longer than three letters must be rejected"
        );
    }

    #[test]
    fn test_validate_language_code_underscore_separator() {
        assert!(
            !validate_language_code("en_US"),
            "underscore is not a valid separator"
        );
    }

    #[test]
    fn test_validate_language_code_subtag_too_short() {
        assert!(
            !validate_language_code("en-a"),
            "single-character subtag must be rejected"
        );
    }

    #[test]
    fn test_validate_language_code_three_letter_primary() {
        assert!(
            validate_language_code("eng"),
            "three-letter primary tag must be accepted"
        );
    }
}