diff --git a/Sources/macSubtitleOCR/MKV/EBML/EBMLParser.swift b/Sources/macSubtitleOCR/MKV/EBML/EBMLParser.swift index 54e6e64..7ab0a1e 100644 --- a/Sources/macSubtitleOCR/MKV/EBML/EBMLParser.swift +++ b/Sources/macSubtitleOCR/MKV/EBML/EBMLParser.swift @@ -26,8 +26,6 @@ func readVINT(from fileHandle: FileHandle, unmodified: Bool = false) -> UInt64 { } // Extract the value - logger.debug("Length: \(length), Mask: 0x\(String(format: "%08X", mask))") - logger.debug("Length: \(length), Mask - 1: 0x\(String(format: "%08X", mask - 1))") if mask - 1 == 0x0F { mask = 0xFF // Hacky workaround that I still don't understand why is needed } else if length == 1, !unmodified { @@ -35,8 +33,6 @@ func readVINT(from fileHandle: FileHandle, unmodified: Bool = false) -> UInt64 { } else { mask = mask - 1 } - logger.debug("Byte before: 0x\(String(format: "%08X", firstByte))") - logger.debug("Byte after: 0x\(String(format: "%08X", firstByte & mask))") var value = UInt64(firstByte & mask) @@ -61,6 +57,5 @@ func readBytes(from fileHandle: FileHandle, length: Int) -> Data? { func readEBMLElement(from fileHandle: FileHandle, unmodified: Bool = false) -> (elementID: UInt32, elementSize: UInt64) { let elementID = readVINT(from: fileHandle, unmodified: unmodified) let elementSize = readVINT(from: fileHandle, unmodified: true) - logger.debug("elementID: 0x\(String(format: "%08X", elementID)), elementSize: \(elementSize)") return (UInt32(elementID), elementSize) } diff --git a/Sources/macSubtitleOCR/MKV/MKV.swift b/Sources/macSubtitleOCR/MKV/MKV.swift deleted file mode 100644 index 393d30d..0000000 --- a/Sources/macSubtitleOCR/MKV/MKV.swift +++ /dev/null @@ -1,287 +0,0 @@ -// -// MKVParser.swift -// macSubtitleOCR -// -// Created by Ethan Dye on 9/16/24. -// Copyright © 2024 Ethan Dye. All rights reserved. -// - -import Foundation -import os - -class MKV { - // MARK: - Properties - - private var eof: UInt64 - private var fileHandle: FileHandle - private var stderr = StandardErrorOutputStream() - private var timestampScale: Double = 1000000.0 // Default value if not specified in a given MKV file - private var tracks = [MKVTrack]() - private let logger = Logger(subsystem: "github.ecdye.macSubtitleOCR", category: "mkv") - - // MARK: - Lifecycle - - init(filePath: String) throws { - // guard FileManager.default.fileExists(atPath: filePath) else { - // print("Error: file '\(filePath)' does not exist", to: &stderr) - // throw macSubtitleOCRError.fileReadError - // } - - fileHandle = try FileHandle(forReadingFrom: URL(fileURLWithPath: filePath)) - eof = fileHandle.seekToEndOfFile() - fileHandle.seek(toFileOffset: 0) - try parseTracks(codec: "S_HDMV/PGS") // TODO: add future codecs as support is added - fileHandle.closeFile() - } - - // MARK: - Getters - - func getTracks() -> [MKVTrack] { - tracks - } - - // MARK: - Functions - - // Parse the EBML structure and find the Tracks section - func parseTracks(codec: String) throws { - guard let _ = findElement(withID: EBML.segmentID) as? (UInt64, UInt32) else { - print("Error: Segment element not found", to: &stderr) - throw MKVError.segmentElementNotFound - } - - guard let (tracksSize, _) = findElement(withID: EBML.tracksID) as? (UInt64, UInt32) else { - print("Error: Tracks element not found", to: &stderr) - throw MKVError.tracksElementNotFound - } - let endOfTracksOffset = fileHandle.offsetInFile + tracksSize - - var trackNums = [Int]() - while fileHandle.offsetInFile < endOfTracksOffset { - if let (elementID, elementSize, _) = tryParseElement() { - if elementID == EBML.trackEntryID { - logger.debug("Found TrackEntry element") - if let track = parseTrackEntry(codec: codec) { - trackNums.append(track) - } - } else if elementID == EBML.chapters { - break - } else { - fileHandle.seek(toFileOffset: fileHandle.offsetInFile + elementSize) - } - } - } - let trackData = extractTrackData(trackNumber: trackNums) - trackData?.enumerated().forEach { index, data in - tracks.append(MKVTrack(trackNumber: index, codecId: codec, trackData: data)) - } - } - - func getSubtitleTrackData(trackNumber: Int, outPath: String) throws -> String? { - let tmpSup = URL(fileURLWithPath: outPath).deletingPathExtension().appendingPathExtension("sup") - .lastPathComponent - - - let manager = FileManager.default - let tmpFilePath = (manager.temporaryDirectory.path + "/\(trackNumber)" + tmpSup) - if manager.createFile(atPath: tmpFilePath, contents: tracks[trackNumber].trackData, attributes: nil) { - logger.debug("Created file at path: \(tmpFilePath).") - return tmpFilePath - } else { - logger.debug("Failed to create file at path: \(tmpFilePath).") - throw PGSError.fileReadError - } - } - - // MARK: - Methods - - // Function to seek to the track bytestream for a specific track number and extract all blocks - private func extractTrackData(trackNumber: [Int]) -> [Data]? { - fileHandle.seek(toFileOffset: 0) - - // Step 1: Locate the Segment element - if let (segmentSize, _) = findElement(withID: EBML.segmentID) as? (UInt64, UInt32) { - let segmentEndOffset = fileHandle.offsetInFile + segmentSize - // swiftformat:disable:next redundantSelf - logger.debug("Found Segment, Size: \(segmentSize), End Offset: \(segmentEndOffset), EOF: \(self.eof)") - var trackData = [Data](repeating: Data(), count: trackNumber.count) - - // Step 2: Parse Clusters within the Segment - while fileHandle.offsetInFile < segmentEndOffset { - if let (clusterSize, _) = findElement(withID: EBML.cluster, avoidCluster: false) as? ( - UInt64, - UInt32) - { - let clusterEndOffset = fileHandle.offsetInFile + clusterSize - // logger.debug("Found Cluster, Size: \(clusterSize), End Offset: \(clusterEndOffset)\n") - - // Step 3: Extract the cluster timestamp - guard let clusterTimestamp = extractClusterTimestamp() else { - logger.warning("Failed to extract cluster timestamp, skipping cluster.") - continue - } - // logger.debug("Cluster Timestamp: \(clusterTimestamp)") - - // Step 4: Parse Blocks (SimpleBlock or Block) within each Cluster - while fileHandle.offsetInFile < clusterEndOffset { - // swiftformat:disable:next redundantSelf - logger.debug("Looking for Block at Offset: \(self.fileHandle.offsetInFile)/\(clusterEndOffset)") - if let (blockSize, blockType) = findElement(withID: EBML.simpleBlock, EBML.blockGroup) as? (UInt64, UInt32) { - var blockStartOffset = fileHandle.offsetInFile - var blockSize = blockSize - - if blockType == EBML.blockGroup { - guard let (ns, _) = findElement(withID: EBML.block) as? (UInt64, UInt32) - else { return nil } - blockSize = ns - blockStartOffset = fileHandle.offsetInFile - } - - // Step 5: Read the track number in the block and compare it - if let (blockTrackNumber, blockTimestamp) = readTrackNumber(from: fileHandle) as? (UInt64, Int64) { - if trackNumber.contains(Int(blockTrackNumber)) { - // Step 6: Calculate and encode the timestamp as 4 bytes in big-endian - // (PGS format) - let absPTS = calcAbsPTSForPGS( - clusterTimestamp, - blockTimestamp, - timestampScale) - let pgsPTS = encodePTSForPGS(absPTS) - - // Step 7: Read the block data and add needed PGS headers and timestamps - let pgsHeader = Data([0x50, 0x47] + pgsPTS + [0x00, 0x00, 0x00, 0x00]) - var blockData = Data() - let raw = fileHandle - .readData(ofLength: Int(blockSize - - (fileHandle.offsetInFile - blockStartOffset))) - var offset = 0 - while (offset + 3) <= raw.count { - let segmentSize = min( - Int(getUInt16BE(buffer: raw, offset: offset + 1) + 3), - raw.count - offset) - logger.debug("Segment size \(segmentSize) at \(offset) type 0x\(String(format: "%02x", raw[offset]))") - - blockData.append(pgsHeader) - blockData.append(raw.subdata(in: offset ..< segmentSize + offset)) - offset += segmentSize - } - - trackData[trackNumber.firstIndex{ $0 == Int(blockTrackNumber) }!].append(blockData) - } else { - // Skip this block if it's for a different track - // swiftformat:disable:next redundantSelf - // logger.debug("Skipping Block at Offset: \(self.fileHandle.offsetInFile)/\(clusterEndOffset)") - // logger.debug("Got Track Number: \(blockTrackNumber) looking for: \(trackNumber)") - fileHandle.seek(toFileOffset: blockStartOffset + blockSize) - } - } - } else { - break // No more blocks found in this cluster - } - } - } else { - break // No more clusters found in the segment - } - } - - return trackData.isEmpty ? nil : trackData - } - - return nil - } - - // Extract the cluster timestamp - private func extractClusterTimestamp() -> Int64? { - if let (timestampElementSize, _) = findElement(withID: EBML.timestamp) as? (UInt64, UInt32) { - return readFixedLengthNumber(fileHandle: fileHandle, length: Int(timestampElementSize)) - } - return nil - } - - // Function to read the track number, timestamp, and lacing type (if any) from a Block or SimpleBlock header - private func readTrackNumber(from fileHandle: FileHandle) -> (UInt64?, Int64) { - let trackNumber = readVINT(from: fileHandle, unmodified: true) - let timestamp = readFixedLengthNumber(fileHandle: fileHandle, length: 2) - let suffix = fileHandle.readData(ofLength: 1).first ?? 0 - - let lacingFlag = (suffix >> 1) & 0x03 // Bits 1 and 2 are the lacing type (unused by us, kept for debugging) - logger.debug("Track number: \(trackNumber), Timestamp: \(timestamp), Lacing type: \(lacingFlag)") - return (trackNumber, timestamp) - } - - // Find EBML element by ID, avoiding Cluster header - private func findElement(withID targetID: UInt32, _ tgtID2: UInt32? = nil, avoidCluster: Bool = true) -> (UInt64?, UInt32?) { - while let (elementID, elementSize, elementOffset) = tryParseElement() { - // Ensure we stop if we have reached or passed the EOF - if fileHandle.offsetInFile >= eof { - return (nil, nil) - } - - // If, by chance, we find a TimestampScale element, update it from the default - if elementID == EBML.timestampScale { - timestampScale = Double(readFixedLengthNumber( - fileHandle: fileHandle, - length: Int(elementSize))) - // swiftformat:disable:next redundantSelf - logger.debug("Found timestamp scale: \(self.timestampScale)") - return (nil, nil) - } - - // If a Cluster header is encountered, seek back to the start of the Cluster - if elementID == EBML.cluster && avoidCluster { - logger.debug("Encountered Cluster: seeking back to before the cluster header") - fileHandle.seek(toFileOffset: elementOffset) - return (nil, nil) - } - - // If the element matches the target ID (or secondary ID), return its size - if elementID == targetID || (tgtID2 != nil && elementID == tgtID2!) { - return (elementSize, elementID) - } else { - // Skip over the element's data by seeking to its end - logger.debug("Found: \(elementID), but not \(targetID), skipping element") - fileHandle.seek(toFileOffset: fileHandle.offsetInFile + elementSize) - } - } - - return (nil, nil) - } - - // Parse TrackEntry and return MKVTrack object - private func parseTrackEntry(codec: String) -> Int? { - var trackNumber: Int? - var trackType: UInt8? - var codecId: String? - - while let (elementID, elementSize, _) = tryParseElement() { - switch elementID { - case EBML.trackNumberID: - trackNumber = Int((readBytes(from: fileHandle, length: 1)?.first)!) - logger.debug("Found track number: \(trackNumber!)") - case EBML.trackTypeID: // Unused by us, left for debugging - trackType = readBytes(from: fileHandle, length: 1)?.first - logger.debug("Found track type: \(trackType!)") - case EBML.codecID: - var data = readBytes(from: fileHandle, length: Int(elementSize)) - data?.removeNullBytes() - codecId = data.flatMap { String(data: $0, encoding: .ascii) } - logger.debug("Found codec ID: \(codecId!)") - default: - fileHandle.seek(toFileOffset: fileHandle.offsetInFile + elementSize) - } - if trackNumber != nil, trackType != nil, codecId != nil { break } - } - - if let trackNumber, let codecId { - if codecId == codec { - return trackNumber - } - } - return nil - } - - private func tryParseElement() -> (elementID: UInt32, elementSize: UInt64, oldOffset: UInt64)? { - let oldOffset = fileHandle.offsetInFile - let (elementID, elementSize) = readEBMLElement(from: fileHandle) - return (elementID, elementSize, oldOffset: oldOffset) - } -} diff --git a/Sources/macSubtitleOCR/MKV/MKVFileHandler.swift b/Sources/macSubtitleOCR/MKV/MKVFileHandler.swift new file mode 100644 index 0000000..6c3eb20 --- /dev/null +++ b/Sources/macSubtitleOCR/MKV/MKVFileHandler.swift @@ -0,0 +1,80 @@ +import Foundation +import os + +class MKVFileHandler: MKVFileHandling { + var fileHandle: FileHandle + var eof: UInt64 + var timestampScale: Double = 1000000.0 // Default value if not specified in a given MKV file + var logger = Logger(subsystem: "github.ecdye.macSubtitleOCR", category: "mkv") + + init(filePath: String) throws { + guard FileManager.default.fileExists(atPath: filePath) else { + throw macSubtitleOCRError.fileReadError + } + self.fileHandle = try FileHandle(forReadingFrom: URL(fileURLWithPath: filePath)) + self.eof = fileHandle.seekToEndOfFile() + fileHandle.seek(toFileOffset: 0) + } + + deinit { + fileHandle.closeFile() + } + + func locateSegment() -> UInt64? { + if let (segmentSize, _) = findElement(withID: EBML.segmentID, avoidCluster: true) as? (UInt64, UInt32) { + return segmentSize + } + return nil + } + + func locateCluster() -> UInt64? { + if let (clusterSize, _) = findElement(withID: EBML.cluster, avoidCluster: false) as? (UInt64, UInt32) { + return clusterSize + } + return nil + } + + // Find EBML element by ID, avoiding Cluster header + func findElement(withID targetID: UInt32, _ tgtID2: UInt32? = nil, avoidCluster: Bool = true) -> (UInt64?, UInt32?) { + while let (elementID, elementSize, elementOffset) = tryParseElement() { + // Ensure we stop if we have reached or passed the EOF + if fileHandle.offsetInFile >= eof { + return (nil, nil) + } + + // If, by chance, we find a TimestampScale element, update it from the default + if elementID == EBML.timestampScale { + timestampScale = Double(readFixedLengthNumber( + fileHandle: fileHandle, + length: Int(elementSize))) + // swiftformat:disable:next redundantSelf + logger.debug("Found timestamp scale: \(self.timestampScale)") + return (nil, nil) + } + + // If a Cluster header is encountered, seek back to the start of the Cluster + if elementID == EBML.cluster && avoidCluster { + logger.debug("Encountered Cluster: seeking back to before the cluster header") + fileHandle.seek(toFileOffset: elementOffset) + return (nil, nil) + } + + // If the element matches the target ID (or secondary ID), return its size + if elementID == targetID || (tgtID2 != nil && elementID == tgtID2!) { + return (elementSize, elementID) + } else { + // Skip over the element's data by seeking to its end + logger.debug("Found: \(elementID), but not \(targetID), skipping element") + fileHandle.seek(toFileOffset: fileHandle.offsetInFile + elementSize) + } + } + + return (nil, nil) + } + + func tryParseElement() -> (elementID: UInt32, elementSize: UInt64, oldOffset: UInt64)? { + let oldOffset = fileHandle.offsetInFile + let (elementID, elementSize) = readEBMLElement(from: fileHandle) + return (elementID, elementSize, oldOffset: oldOffset) + } +} diff --git a/Sources/macSubtitleOCR/MKV/MKVProtocol.swift b/Sources/macSubtitleOCR/MKV/MKVProtocol.swift new file mode 100644 index 0000000..dffcc77 --- /dev/null +++ b/Sources/macSubtitleOCR/MKV/MKVProtocol.swift @@ -0,0 +1,28 @@ +// +// mkvProtocol.swift +// macSubtitleOCR +// +// Created by Ethan Dye on 9/20/24. +// Copyright © 2024 Ethan Dye. All rights reserved. +// + +import Foundation + +protocol MKVFileHandling { + var fileHandle: FileHandle { get } + var eof: UInt64 { get } + var timestampScale: Double { get set } + + func locateSegment() -> UInt64? + func locateCluster() -> UInt64? + func findElement(withID targetID: UInt32, _ tgtID2: UInt32?, avoidCluster: Bool) -> (UInt64?, UInt32?) +} + +protocol MKVSubtitleExtracting { + func getSubtitleTrackData(trackNumber: Int, outPath: String) throws -> String? +} + +protocol MKVTrackParsing { + var tracks: [MKVTrack] { get set } + func parseTracks(codec: String) throws +} diff --git a/Sources/macSubtitleOCR/MKV/MKVSubtitleExtractor.swift b/Sources/macSubtitleOCR/MKV/MKVSubtitleExtractor.swift new file mode 100644 index 0000000..2cebf5c --- /dev/null +++ b/Sources/macSubtitleOCR/MKV/MKVSubtitleExtractor.swift @@ -0,0 +1,18 @@ +import Foundation +import os + +class MKVSubtitleExtractor: MKVTrackParser, MKVSubtitleExtracting { + func getSubtitleTrackData(trackNumber: Int, outPath: String) throws -> String? { + let tmpSup = URL(fileURLWithPath: outPath).deletingPathExtension().appendingPathExtension("sup").lastPathComponent + let manager = FileManager.default + let tmpFilePath = (manager.temporaryDirectory.path + "/\(trackNumber)" + tmpSup) + + if manager.createFile(atPath: tmpFilePath, contents: tracks[trackNumber].trackData, attributes: nil) { + logger.debug("Created file at path: \(tmpFilePath).") + return tmpFilePath + } else { + logger.debug("Failed to create file at path: \(tmpFilePath).") + throw PGSError.fileReadError + } + } +} diff --git a/Sources/macSubtitleOCR/MKV/MKVTrackParser.swift b/Sources/macSubtitleOCR/MKV/MKVTrackParser.swift new file mode 100644 index 0000000..1c3c055 --- /dev/null +++ b/Sources/macSubtitleOCR/MKV/MKVTrackParser.swift @@ -0,0 +1,163 @@ +import Foundation +import os + +class MKVTrackParser: MKVFileHandler, MKVTrackParsing { + var tracks: [MKVTrack] = [] + private var stderr = StandardErrorOutputStream() + + func parseTracks(codec: String) throws { + guard let _ = findElement(withID: EBML.segmentID) as? (UInt64, UInt32) else { + print("Error: Segment element not found", to: &stderr) + throw MKVError.segmentElementNotFound + } + + guard let (tracksSize, _) = findElement(withID: EBML.tracksID) as? (UInt64, UInt32) else { + print("Error: Tracks element not found", to: &stderr) + throw MKVError.tracksElementNotFound + } + + let endOfTracksOffset = fileHandle.offsetInFile + tracksSize + + var trackNums = [Int]() + while fileHandle.offsetInFile < endOfTracksOffset { + if let (elementID, elementSize, _) = tryParseElement() { + if elementID == EBML.trackEntryID { + logger.debug("Found TrackEntry element") + if let track = parseTrackEntry(codec: codec) { + trackNums.append(track) + } + } else if elementID == EBML.chapters { + break + } else { + fileHandle.seek(toFileOffset: fileHandle.offsetInFile + elementSize) + } + } + } + + let trackData = extractTrackData(trackNumber: trackNums) + trackData?.enumerated().forEach { index, data in + tracks.append(MKVTrack(trackNumber: index, codecId: codec, trackData: data)) + } + } + + private func parseTrackEntry(codec: String) -> Int? { + var trackNumber: Int? + var trackType: UInt8? + var codecId: String? + + while let (elementID, elementSize, _) = tryParseElement() { + switch elementID { + case EBML.trackNumberID: + trackNumber = Int((readBytes(from: fileHandle, length: 1)?.first)!) + logger.debug("Found track number: \(trackNumber!)") + case EBML.trackTypeID: // Unused by us, left for debugging + trackType = readBytes(from: fileHandle, length: 1)?.first + logger.debug("Found track type: \(trackType!)") + case EBML.codecID: + var data = readBytes(from: fileHandle, length: Int(elementSize)) + data?.removeNullBytes() + codecId = data.flatMap { String(data: $0, encoding: .ascii) } + logger.debug("Found codec ID: \(codecId!)") + default: + fileHandle.seek(toFileOffset: fileHandle.offsetInFile + elementSize) + } + if trackNumber != nil, trackType != nil, codecId != nil { break } + } + + if let trackNumber, let codecId { + if codecId == codec { + return trackNumber + } + } + return nil + } + + // Implement track extraction logic (e.g., `extractTrackData`) here + func extractTrackData(trackNumber: [Int]) -> [Data]? { + fileHandle.seek(toFileOffset: 0) + + // Step 1: Locate the Segment element + guard let segmentSize = locateSegment() else { return nil } + let segmentEndOffset = fileHandle.offsetInFile + segmentSize + logger.debug("Found Segment, Size: \(segmentSize), End Offset: \(segmentEndOffset), EOF: \(self.eof)") + + var trackData = [Data](repeating: Data(), count: trackNumber.count) + + // Step 2: Parse Clusters within the Segment + while fileHandle.offsetInFile < segmentEndOffset { + guard let clusterSize = locateCluster() else { continue } + let clusterEndOffset = fileHandle.offsetInFile + clusterSize + + // Step 3: Extract the cluster timestamp + guard let clusterTimestamp = extractClusterTimestamp() else { + logger.warning("Failed to extract cluster timestamp, skipping cluster.") + continue + } + + // Step 4: Parse Blocks (SimpleBlock or Block) within each Cluster + parseBlocks(within: clusterEndOffset, trackNumber: trackNumber, clusterTimestamp: clusterTimestamp, trackData: &trackData) + } + + return trackData.isEmpty ? nil : trackData + } + + private func extractClusterTimestamp() -> Int64? { + if let (timestampElementSize, _) = findElement(withID: EBML.timestamp) as? (UInt64, UInt32) { + return readFixedLengthNumber(fileHandle: fileHandle, length: Int(timestampElementSize)) + } + return nil + } + + private func parseBlocks(within clusterEndOffset: UInt64, trackNumber: [Int], clusterTimestamp: Int64, trackData: inout [Data]) { + while fileHandle.offsetInFile < clusterEndOffset { + logger.debug("Looking for Block at Offset: \(self.fileHandle.offsetInFile)/\(clusterEndOffset)") + guard case (var blockSize?, let blockType?) = findElement(withID: EBML.simpleBlock, EBML.blockGroup) else { break } + + var blockStartOffset = fileHandle.offsetInFile + + if blockType == EBML.blockGroup { + guard let (ns, _) = findElement(withID: EBML.block) as? (UInt64, UInt32) else { return } + blockSize = ns + blockStartOffset = fileHandle.offsetInFile + } + + // Step 5: Read the track number in the block and compare it + guard let (blockTrackNumber, blockTimestamp) = readTrackNumber(from: fileHandle) as? (UInt64, Int64) else { continue } + if trackNumber.contains(Int(blockTrackNumber)) { + // Step 6: Calculate and encode the timestamp as 4 bytes in big-endian (PGS format) + let absPTS = calcAbsPTSForPGS(clusterTimestamp, blockTimestamp, timestampScale) + let pgsPTS = encodePTSForPGS(absPTS) + + // Step 7: Read the block data and add needed PGS headers and timestamps + let pgsHeader = Data([0x50, 0x47] + pgsPTS + [0x00, 0x00, 0x00, 0x00]) + var blockData = Data() + let raw = fileHandle.readData(ofLength: Int(blockSize - (fileHandle.offsetInFile - blockStartOffset))) + var offset = 0 + while (offset + 3) <= raw.count { + let segmentSize = min(Int(getUInt16BE(buffer: raw, offset: offset + 1) + 3), raw.count - offset) + logger.debug("Segment size \(segmentSize) at \(offset) type 0x\(String(format: "%02x", raw[offset]))") + + blockData.append(pgsHeader) + blockData.append(raw.subdata(in: offset ..< segmentSize + offset)) + offset += segmentSize + } + + trackData[trackNumber.firstIndex { $0 == Int(blockTrackNumber) }!].append(blockData) + } else { + // Skip this block if it's for a different track + fileHandle.seek(toFileOffset: blockStartOffset + blockSize) + } + } + } + + // Function to read the track number, timestamp, and lacing type (if any) from a Block or SimpleBlock header + func readTrackNumber(from fileHandle: FileHandle) -> (UInt64?, Int64) { + let trackNumber = readVINT(from: fileHandle, unmodified: true) + let timestamp = readFixedLengthNumber(fileHandle: fileHandle, length: 2) + let suffix = fileHandle.readData(ofLength: 1).first ?? 0 + + let lacingFlag = (suffix >> 1) & 0x03 // Bits 1 and 2 are the lacing type (unused by us, kept for debugging) + logger.debug("Track number: \(trackNumber), Timestamp: \(timestamp), Lacing type: \(lacingFlag)") + return (trackNumber, timestamp) + } +} diff --git a/Sources/macSubtitleOCR/macSubtitleOCR.swift b/Sources/macSubtitleOCR/macSubtitleOCR.swift index 54676c9..7857f0c 100644 --- a/Sources/macSubtitleOCR/macSubtitleOCR.swift +++ b/Sources/macSubtitleOCR/macSubtitleOCR.swift @@ -17,7 +17,7 @@ import Vision struct macSubtitleOCR: ParsableCommand { // MARK: - Properties - @Argument(help: "Input .sup subtitle file") + @Argument(help: "Input file containing the subtitle stream (.sup or .mkv)") var sup: String @Argument(help: "Directory to output the completed .srt files to") @@ -58,101 +58,23 @@ struct macSubtitleOCR: ParsableCommand { var subIndex = 1 var jsonStream: [Any] = [] var srtStreams: [Int: SRT] = [:] + var intermediateFiles: [Int: String] = [:] if sup.hasSuffix(".mkv") { - let mkvStream = try MKV(filePath: sup) - let tracks = mkvStream.getTracks() - for track in tracks { + let mkvStream = try MKVSubtitleExtractor(filePath: sup) + try mkvStream.parseTracks(codec: "S_HDMV/PGS") + for track in mkvStream.tracks { subIndex = 1 // reset counter for each track logger.debug("Found subtitle track: \(track.trackNumber), Codec: \(track.codecId)") - let subtitleDataPath = try mkvStream.getSubtitleTrackData(trackNumber: track.trackNumber, outPath: sup)! + intermediateFiles[track.trackNumber] = try mkvStream.getSubtitleTrackData(trackNumber: track.trackNumber, outPath: sup)! // Open the PGS data stream - let PGS = try PGS(URL(fileURLWithPath: subtitleDataPath)) + let PGS = try PGS(URL(fileURLWithPath: intermediateFiles[track.trackNumber]!)) let srtStream = SRT() srtStreams[track.trackNumber] = srtStream - for subtitle in PGS.getSubtitles() { - if subtitle.imageWidth == 0, subtitle.imageHeight == 0 { - logger.debug("Skipping subtitle index \(subIndex) with empty image data!") - continue - } - - guard let subImage = PGS.createImage(index: subIndex - 1) - else { - logger.info("Could not create image for index \(subIndex)! Skipping...") - continue - } - - // Save subtitle image as PNG if imageDirectory is provided - if let imageDirectory { - let outputDirectory = URL(fileURLWithPath: imageDirectory) - do { - try manager.createDirectory(at: outputDirectory, withIntermediateDirectories: false, attributes: nil) - } catch CocoaError.fileWriteFileExists { - // Folder already existed - } - let pngPath = outputDirectory.appendingPathComponent("subtitle_\(subIndex).png") - - try saveImageAsPNG(image: subImage, outputPath: pngPath) - } - - // Perform text recognition - let request = VNRecognizeTextRequest { request, _ in - guard let observations = request.results as? [VNRecognizedTextObservation] else { return } - - var subtitleLines: [[String: Any]] = [] - var subtitleText = "" - var index = 0 - for observation in observations { - let candidate = observation.topCandidates(1).first - let string = candidate?.string ?? "" - let confidence = candidate?.confidence ?? 0.0 - let stringRange = string.startIndex ..< string.endIndex - let boxObservation = try? candidate?.boundingBox(for: stringRange) - let boundingBox = boxObservation?.boundingBox ?? .zero - let rect = VNImageRectForNormalizedRect(boundingBox, subtitle.imageWidth, subtitle.imageHeight) - - let line: [String: Any] = [ - "text": string, - "confidence": confidence, - "x": Int(rect.minX), - "width": Int(rect.size.width), - "y": Int(CGFloat(subtitle.imageHeight) - rect.minY - rect.size.height), - "height": Int(rect.size.height), - ] - - subtitleLines.append(line) - subtitleText += string - index += 1 - if index != observations.count { - subtitleText += "\n" - } - } - - let subtitleData: [String: Any] = [ - "image": subIndex, - "lines": subtitleLines, - "text": subtitleText, - ] - - srtStreams[track.trackNumber]?.appendSubtitle(SRTSubtitle(index: subIndex, - startTime: subtitle.timestamp, - endTime: subtitle.endTimestamp, - text: subtitleText)) - - } - - request.recognitionLevel = recognitionLevel - request.usesLanguageCorrection = languageCorrection - request.revision = revision - request.recognitionLanguages = languages - - try? VNImageRequestHandler(cgImage: subImage, options: [:]).perform([request]) - - subIndex += 1 - } + try processSubtitles(PGS: PGS, srtStream: srtStream, trackNumber: track.trackNumber, subIndex: subIndex, jsonStream: &jsonStream, logger: logger, manager: manager, recognitionLevel: recognitionLevel, languages: languages, revision: revision) } } else { // Open the PGS data stream @@ -160,173 +82,9 @@ struct macSubtitleOCR: ParsableCommand { let srtStream = SRT() srtStreams[0] = srtStream - for subtitle in PGS.getSubtitles() { - if subtitle.imageWidth == 0, subtitle.imageHeight == 0 { - logger.debug("Skipping subtitle index \(subIndex) with empty image data!") - continue - } - - guard let subImage = PGS.createImage(index: subIndex - 1) - else { - logger.info("Could not create image for index \(subIndex)! Skipping...") - continue - } - - // Save subtitle image as PNG if imageDirectory is provided - if let imageDirectory { - let outputDirectory = URL(fileURLWithPath: imageDirectory) - do { - try manager.createDirectory(at: outputDirectory, withIntermediateDirectories: false, attributes: nil) - } catch CocoaError.fileWriteFileExists { - // Folder already existed - } - let pngPath = outputDirectory.appendingPathComponent("subtitle_\(subIndex).png") - - try saveImageAsPNG(image: subImage, outputPath: pngPath) - } - - // Perform text recognition - let request = VNRecognizeTextRequest { request, _ in - guard let observations = request.results as? [VNRecognizedTextObservation] else { return } - - var subtitleLines: [[String: Any]] = [] - var subtitleText = "" - var index = 0 - for observation in observations { - let candidate = observation.topCandidates(1).first - let string = candidate?.string ?? "" - let confidence = candidate?.confidence ?? 0.0 - let stringRange = string.startIndex ..< string.endIndex - let boxObservation = try? candidate?.boundingBox(for: stringRange) - let boundingBox = boxObservation?.boundingBox ?? .zero - let rect = VNImageRectForNormalizedRect(boundingBox, subtitle.imageWidth, subtitle.imageHeight) - - let line: [String: Any] = [ - "text": string, - "confidence": confidence, - "x": Int(rect.minX), - "width": Int(rect.size.width), - "y": Int(CGFloat(subtitle.imageHeight) - rect.minY - rect.size.height), - "height": Int(rect.size.height), - ] - - subtitleLines.append(line) - subtitleText += string - index += 1 - if index != observations.count { - subtitleText += "\n" - } - } - - let subtitleData: [String: Any] = [ - "image": subIndex, - "lines": subtitleLines, - "text": subtitleText, - ] - - jsonStream.append(subtitleData) - - // Append subtitle to SRT stream - srtStream.appendSubtitle(SRTSubtitle(index: subIndex, - startTime: subtitle.timestamp, - endTime: subtitle.endTimestamp, - text: subtitleText)) - } - - request.recognitionLevel = recognitionLevel - request.usesLanguageCorrection = languageCorrection - request.revision = revision - request.recognitionLanguages = languages - - try? VNImageRequestHandler(cgImage: subImage, options: [:]).perform([request]) - - subIndex += 1 - } + try processSubtitles(PGS: PGS, srtStream: srtStream, trackNumber: 0, subIndex: subIndex, jsonStream: &jsonStream, logger: logger, manager: manager, recognitionLevel: recognitionLevel, languages: languages, revision: revision) } - // for subtitle in PGS.getSubtitles() { - // if subtitle.imageWidth == 0, subtitle.imageHeight == 0 { - // logger.debug("Skipping subtitle index \(subIndex) with empty image data!") - // continue - // } - - // guard let subImage = PGS.createImage(index: subIndex - 1) - // else { - // logger.info("Could not create image for index \(subIndex)! Skipping...") - // continue - // } - - // // Save subtitle image as PNG if imageDirectory is provided - // if let imageDirectory { - // let outputDirectory = URL(fileURLWithPath: imageDirectory) - // do { - // try manager.createDirectory(at: outputDirectory, withIntermediateDirectories: false, attributes: nil) - // } catch CocoaError.fileWriteFileExists { - // // Folder already existed - // } - // let pngPath = outputDirectory.appendingPathComponent("subtitle_\(subIndex).png") - - // try saveImageAsPNG(image: subImage, outputPath: pngPath) - // } - - // // Perform text recognition - // let request = VNRecognizeTextRequest { request, _ in - // guard let observations = request.results as? [VNRecognizedTextObservation] else { return } - - // var subtitleLines: [[String: Any]] = [] - // var subtitleText = "" - // var index = 0 - // for observation in observations { - // let candidate = observation.topCandidates(1).first - // let string = candidate?.string ?? "" - // let confidence = candidate?.confidence ?? 0.0 - // let stringRange = string.startIndex ..< string.endIndex - // let boxObservation = try? candidate?.boundingBox(for: stringRange) - // let boundingBox = boxObservation?.boundingBox ?? .zero - // let rect = VNImageRectForNormalizedRect(boundingBox, subtitle.imageWidth, subtitle.imageHeight) - - // let line: [String: Any] = [ - // "text": string, - // "confidence": confidence, - // "x": Int(rect.minX), - // "width": Int(rect.size.width), - // "y": Int(CGFloat(subtitle.imageHeight) - rect.minY - rect.size.height), - // "height": Int(rect.size.height), - // ] - - // subtitleLines.append(line) - // subtitleText += string - // index += 1 - // if index != observations.count { - // subtitleText += "\n" - // } - // } - - // let subtitleData: [String: Any] = [ - // "image": subIndex, - // "lines": subtitleLines, - // "text": subtitleText, - // ] - - // jsonStream.append(subtitleData) - - // // Append subtitle to SRT stream - // srtStream.appendSubtitle(SRTSubtitle(index: subIndex, - // startTime: subtitle.timestamp, - // endTime: subtitle.endTimestamp, - // text: subtitleText)) - // } - - // request.recognitionLevel = recognitionLevel - // request.usesLanguageCorrection = languageCorrection - // request.revision = revision - // request.recognitionLanguages = languages - - // try? VNImageRequestHandler(cgImage: subImage, options: [:]).perform([request]) - - // subIndex += 1 - // } - if let json { // Convert subtitle data to JSON let jsonData = try JSONSerialization.data(withJSONObject: jsonStream, options: [.prettyPrinted, .sortedKeys]) @@ -338,14 +96,6 @@ struct macSubtitleOCR: ParsableCommand { encoding: .utf8) } - if saveSup, inFile.hasSuffix(".mkv") { - try manager.moveItem( - at: URL(fileURLWithPath: sup), - to: URL(fileURLWithPath: inFile).deletingPathExtension().appendingPathExtension("sup")) - } else { - try manager.removeItem(at: URL(fileURLWithPath: sup)) - } - for (trackNumber, srtStream) in srtStreams { let outputDirectory = URL(fileURLWithPath: srtDirectory) do { @@ -355,9 +105,15 @@ struct macSubtitleOCR: ParsableCommand { } let srtFilePath = outputDirectory.appendingPathComponent("track_\(trackNumber).srt") try srtStream.write(toFileAt: srtFilePath) + if saveSup, inFile.hasSuffix(".mkv") { + let fileName = "\(trackNumber)_" + URL(fileURLWithPath: inFile).deletingPathExtension().appendingPathExtension("sup").lastPathComponent + try manager.moveItem( + at: URL(fileURLWithPath: intermediateFiles[trackNumber]!), + to: URL(fileURLWithPath: inFile).deletingLastPathComponent().appendingPathComponent(fileName)) + } else { + try manager.removeItem(at: URL(fileURLWithPath: intermediateFiles[trackNumber]!)) + } } - - //try srtStream.write(toFileAt: URL(fileURLWithPath: srt)) } // MARK: - Methods @@ -389,4 +145,91 @@ struct macSubtitleOCR: ParsableCommand { throw macSubtitleOCRError.fileWriteError } } + + private func processSubtitles(PGS: PGS, srtStream: SRT, trackNumber: Int, subIndex: Int, jsonStream: inout [Any], logger: Logger, manager: FileManager, recognitionLevel: VNRequestTextRecognitionLevel, languages: [String], revision: Int) throws { + var subIndex = subIndex + var inJson = jsonStream + for subtitle in PGS.getSubtitles() { + if subtitle.imageWidth == 0, subtitle.imageHeight == 0 { + logger.debug("Skipping subtitle index \(subIndex) with empty image data!") + continue + } + + guard let subImage = PGS.createImage(index: subIndex - 1) + else { + logger.info("Could not create image for index \(subIndex)! Skipping...") + continue + } + + // Save subtitle image as PNG if imageDirectory is provided + if let imageDirectory { + let outputDirectory = URL(fileURLWithPath: imageDirectory) + do { + try manager.createDirectory(at: outputDirectory, withIntermediateDirectories: false, attributes: nil) + } catch CocoaError.fileWriteFileExists { + // Folder already existed + } + let pngPath = outputDirectory.appendingPathComponent("subtitle_\(subIndex).png") + + try saveImageAsPNG(image: subImage, outputPath: pngPath) + } + + // Perform text recognition + let request = VNRecognizeTextRequest { request, _ in + guard let observations = request.results as? [VNRecognizedTextObservation] else { return } + + var subtitleLines: [[String: Any]] = [] + var subtitleText = "" + var index = 0 + for observation in observations { + let candidate = observation.topCandidates(1).first + let string = candidate?.string ?? "" + let confidence = candidate?.confidence ?? 0.0 + let stringRange = string.startIndex ..< string.endIndex + let boxObservation = try? candidate?.boundingBox(for: stringRange) + let boundingBox = boxObservation?.boundingBox ?? .zero + let rect = VNImageRectForNormalizedRect(boundingBox, subtitle.imageWidth, subtitle.imageHeight) + + let line: [String: Any] = [ + "text": string, + "confidence": confidence, + "x": Int(rect.minX), + "width": Int(rect.size.width), + "y": Int(CGFloat(subtitle.imageHeight) - rect.minY - rect.size.height), + "height": Int(rect.size.height), + ] + + subtitleLines.append(line) + subtitleText += string + index += 1 + if index != observations.count { + subtitleText += "\n" + } + } + + let subtitleData: [String: Any] = [ + "image": subIndex, + "lines": subtitleLines, + "text": subtitleText, + ] + + inJson.append(subtitleData) + + srtStream.appendSubtitle(SRTSubtitle(index: subIndex, + startTime: subtitle.timestamp, + endTime: subtitle.endTimestamp, + text: subtitleText)) + } + + request.recognitionLevel = recognitionLevel + request.usesLanguageCorrection = languageCorrection + request.revision = revision + request.recognitionLanguages = languages + + try? VNImageRequestHandler(cgImage: subImage, options: [:]).perform([request]) + + subIndex += 1 + } + jsonStream = inJson + } } diff --git a/Tests/macSubtitleOCRTests/Resources/testFiles/test.mkv b/Tests/macSubtitleOCRTests/Resources/testFiles/test.mkv index cde67fd..c09de9d 100644 Binary files a/Tests/macSubtitleOCRTests/Resources/testFiles/test.mkv and b/Tests/macSubtitleOCRTests/Resources/testFiles/test.mkv differ