import * as mammoth from 'mammoth'

/** File types this module distinguishes, derived from the file extension. */
export type TranscriptionFileType = 'docx' | 'vtt' | 'txt' | 'md'

/** Result of {@link parseFile}: file metadata plus the extracted plain text. */
export interface ParsedTranscription {
  /** `File.name` of the parsed file. */
  fileName: string
  /**
   * Type derived from the file extension. Content sniffed as WebVTT is still
   * parsed as VTT, but this field keeps the extension-based value.
   */
  fileType: TranscriptionFileType
  /** Extracted text, trimmed; for VTT content it is flattened to a single line. */
  text: string
  /** `File.size` in bytes. */
  size: number
}

// WebVTT signature. The format requires it at the very start of the file, so it
// is anchored there (no `m` flag) to avoid matching ordinary text lines that
// merely begin with the word "WEBVTT".
const VTT_HEADER_RE = /^\s*WEBVTT(?:\s|$)/i

// Cue timing line, e.g. "00:00:01.000 --> 00:00:04.000". WebVTT makes the hours
// component optional, so "00:01.000 --> 00:04.000" must match as well.
const VTT_TIMING_RE =
  /^(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}\s*-->\s*(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}/m

// Unanchored, global variant of the timing pattern. Not referenced by the code
// in this section; note that a `g` regex is stateful when used with `.test()`.
const VTT_CUE_RE =
  /(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}\s*-->\s*(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}/g

// Header metadata lines ("Kind: captions", "Language: en") that may follow the signature.
const VTT_HEADER_META_RE = /^(?:Kind|Language):/i

// A line consisting only of digits is treated as a numeric cue identifier.
const NUMERIC_LINE_RE = /^\d+$/

/**
 * Heuristically decides whether `text` is WebVTT: either it starts with the
 * `WEBVTT` signature or some line starts with a cue timing.
 */
function isVTT(text: string): boolean {
  return VTT_HEADER_RE.test(text) || VTT_TIMING_RE.test(text)
}

/**
 * Flattens WebVTT content into a single line of text.
 *
 * Dropped: blank lines, the `WEBVTT` signature line (including any trailing
 * title text), `Kind:` / `Language:` lines that appear before the first cue,
 * cue timing lines, and digit-only lines (numeric cue identifiers). All
 * remaining lines are joined with single spaces.
 *
 * Known limitation kept from the previous behaviour: a cue whose whole text is
 * a number is indistinguishable from a cue identifier and is dropped too.
 *
 * @param raw Raw file content.
 * @returns Cue text with runs of whitespace collapsed and the ends trimmed.
 */
function parseVtt(raw: string): string {
  const kept: string[] = []
  let signatureChecked = false
  let seenCue = false

  for (const line of raw.split(/\r?\n/)) {
    const trimmed = line.trim()
    if (!trimmed) continue

    // Only the first non-blank line can be the signature; later lines that
    // happen to start with "WEBVTT" are ordinary text and must be kept.
    if (!signatureChecked) {
      signatureChecked = true
      if (VTT_HEADER_RE.test(trimmed)) continue
    }

    if (VTT_TIMING_RE.test(trimmed)) {
      seenCue = true
      continue
    }

    // Header metadata is only meaningful before the first cue; afterwards a
    // line starting with "Kind:" or "Language:" is spoken text.
    if (!seenCue && VTT_HEADER_META_RE.test(trimmed)) continue

    if (NUMERIC_LINE_RE.test(trimmed)) continue

    kept.push(trimmed)
  }

  return kept.join(' ').replace(/\s+/g, ' ').trim()
}

/**
 * Maps a file name to a {@link TranscriptionFileType} using its extension
 * (case-insensitive). Names without a dot and unrecognised extensions fall
 * back to `'txt'`.
 */
function detectType(fileName: string): TranscriptionFileType {
  const dot = fileName.lastIndexOf('.')
  // Without a dot there is no extension; do not mistake a bare name such as
  // "docx" for one.
  const ext = dot === -1 ? '' : fileName.slice(dot + 1).toLowerCase()
  if (ext === 'docx') return 'docx'
  if (ext === 'vtt') return 'vtt'
  if (ext === 'md') return 'md'
  return 'txt'
}

/**
 * Reads `file` and extracts its text.
 *
 * - `docx`: raw text is extracted with `mammoth.extractRawText`.
 * - `vtt`, or any other type whose content looks like WebVTT: cue text is
 *   flattened with `parseVtt`.
 * - everything else: the file content, trimmed.
 *
 * Rejections from `file.arrayBuffer()`, `file.text()` or mammoth are not caught
 * here and propagate to the caller.
 *
 * @param file File to parse.
 * @returns File name, extension-derived type, extracted text and size in bytes.
 */
export async function parseFile(file: File): Promise<ParsedTranscription> {
  const fileName = file.name
  const fileType = detectType(fileName)
  const size = file.size

  console.log(`[Alpha] Parsing file: ${fileName} (${fileType}, ${size} bytes)`)

  let text: string
  if (fileType === 'docx') {
    const arrayBuffer = await file.arrayBuffer()
    const result = await mammoth.extractRawText({ arrayBuffer })
    text = result.value.trim()
  } else {
    const raw = await file.text()
    // Sniff the content as well: transcripts are sometimes saved as .txt/.md
    // while still being WebVTT inside.
    text = fileType === 'vtt' || isVTT(raw) ? parseVtt(raw) : raw.trim()
  }

  return { fileName, fileType, text, size }
}