Skip to content

Commit

Permalink
Simplify main program logic and refine tests
Browse files Browse the repository at this point in the history
Signed-off-by: Ethan Dye <[email protected]>
  • Loading branch information
ecdye committed Sep 26, 2024
1 parent 98ac956 commit 8193f26
Show file tree
Hide file tree
Showing 7 changed files with 184 additions and 924 deletions.
3 changes: 0 additions & 3 deletions Package.swift
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// swift-tools-version: 6.0
// The swift-tools-version declares the minimum version of Swift required to build this package.

import PackageDescription

Expand All @@ -12,8 +11,6 @@ let package = Package(
.package(url: "https://github.com/apple/swift-argument-parser.git", from: "1.5.0"),
],
targets: [
// Targets are the basic building blocks of a package, defining a module or a test suite.
// Targets can depend on other targets in this package and products from dependencies.
.executableTarget(
name: "macSubtitleOCR",
dependencies: [
Expand Down
2 changes: 1 addition & 1 deletion Sources/macSubtitleOCR/MKV/MKVTrackParser.swift
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// MKVTrackParser.swift
// macSubtitleOCR
//
// Created by Ethan Dye on 9/20/24.
// Created by Ethan Dye on 9/22/24.
// Copyright © 2024 Ethan Dye. All rights reserved.
//

Expand Down
84 changes: 73 additions & 11 deletions Sources/macSubtitleOCR/PGS/PGS.swift
Original file line number Diff line number Diff line change
Expand Up @@ -42,17 +42,19 @@ class PGS {
return nil
}

return CGImage(width: subtitles[index].imageWidth,
height: subtitles[index].imageHeight,
bitsPerComponent: 8,
bitsPerPixel: 32,
bytesPerRow: subtitles[index].imageWidth * 4, // 4 bytes per pixel (RGBA)
space: colorSpace,
bitmapInfo: bitmapInfo,
provider: provider,
decode: nil,
shouldInterpolate: false,
intent: .defaultIntent)
let image = CGImage(width: subtitles[index].imageWidth,
height: subtitles[index].imageHeight,
bitsPerComponent: 8,
bitsPerPixel: 32,
bytesPerRow: subtitles[index].imageWidth * 4, // 4 bytes per pixel (RGBA)
space: colorSpace,
bitmapInfo: bitmapInfo,
provider: provider,
decode: nil,
shouldInterpolate: false,
intent: .defaultIntent)

return cropImageToVisibleArea(image!, buffer: 5)
}

// MARK: - Methods
Expand Down Expand Up @@ -172,4 +174,64 @@ class PGS {
}
}
}

/// Crops `image` to the smallest rectangle that contains every non-transparent pixel,
/// grown by `buffer` pixels on each side and clamped to the image bounds.
///
/// Only 8-bit-per-component, 4-byte-per-pixel images with the alpha channel stored last
/// (`.last` or `.premultipliedLast`) are supported, because the alpha byte is read at
/// offset 3 of every pixel.
///
/// - Parameters:
///   - image: The image to crop.
///   - buffer: Padding, in pixels, added around the visible area. Expected to be non-negative.
/// - Returns: The cropped image, or `nil` when the pixel data is unavailable, the pixel
///   layout is unsupported, or every pixel is fully transparent.
private func cropImageToVisibleArea(_ image: CGImage, buffer: Int) -> CGImage? {
    // `data` stays alive for the whole function, which keeps `pixelData` valid.
    guard let data = image.dataProvider?.data,
          let pixelData = CFDataGetBytePtr(data)
    else {
        return nil
    }

    let width = image.width
    let height = image.height
    let bytesPerPixel = image.bitsPerPixel / 8
    let bytesPerRow = image.bytesPerRow
    let alphaInfo = image.alphaInfo
    let alphaOffset = 3 // Alpha is the last of four 8-bit components (RGBA).

    // Ensure that the image contains an alpha channel where this function expects it.
    guard alphaInfo == .premultipliedLast || alphaInfo == .last,
          image.bitsPerComponent == 8,
          bytesPerPixel == 4
    else {
        return nil
    }

    // Never read past the end of the backing buffer.
    guard width > 0, height > 0,
          CFDataGetLength(data) >= (height - 1) * bytesPerRow + width * bytesPerPixel
    else {
        return nil
    }

    // `maxX`/`maxY` start at -1 so that a visible pixel in column 0 or row 0 can be
    // told apart from "no visible pixel found at all".
    var minX = width
    var maxX = -1
    var minY = height
    var maxY = -1

    // Scan every pixel to find the bounding box of the non-transparent area.
    for y in 0 ..< height {
        let rowStart = y * bytesPerRow
        for x in 0 ..< width where pixelData[rowStart + x * bytesPerPixel + alphaOffset] > 0 {
            minX = min(minX, x)
            maxX = max(maxX, x)
            minY = min(minY, y)
            maxY = max(maxY, y)
        }
    }

    // No pixel with alpha > 0 was found: the image is fully transparent.
    guard maxX >= 0, maxY >= 0 else {
        return nil
    }

    // Add the buffer to the bounding box, clamped to the image bounds.
    minX = max(0, minX - buffer)
    maxX = min(width - 1, maxX + buffer)
    minY = max(0, minY - buffer)
    maxY = min(height - 1, maxY + buffer)

    let croppedRect = CGRect(x: minX,
                             y: minY,
                             width: maxX - minX + 1,
                             height: maxY - minY + 1)

    // Create a cropped image from the original image.
    return image.cropping(to: croppedRect)
}
}
142 changes: 61 additions & 81 deletions Sources/macSubtitleOCR/macSubtitleOCR.swift
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
//

import ArgumentParser
import Cocoa
import os
import UniformTypeIdentifiers
import Vision

private let logger: Logger = .init(subsystem: "github.ecdye.macSubtitleOCR", category: "main")

// The main struct representing the macSubtitleOCR command-line tool.
@main
struct macSubtitleOCR: ParsableCommand {
Expand All @@ -20,45 +21,34 @@ struct macSubtitleOCR: ParsableCommand {
@Argument(help: "Input file containing the subtitle stream (.sup or .mkv)")
var input: String

@Argument(help: "Directory to output the completed .srt files to")
var srtDirectory: String

@Option(help: "File to output the OCR direct output in json to (optional)")
var json: String?

@Option(help: "Output image files of subtitles to directory (optional)")
var imageDirectory: String?
@Argument(help: "Directory to output files to")
var outputDirectory: String

@Option(wrappedValue: "en", help: "The input image language(s)")
var language: String

@Flag(help: "Save image files for subtitle track (optional)")
var saveImages = false

@Flag(help: "Save the raw json results from OCR (optional)")
var json = false

@Flag(help: "Enable fast mode (less accurate)")
var fastMode = false

@Flag(help: "Enable language correction")
var languageCorrection = false

@Flag(help: "Save extracted `.sup` file to disk (MKV input only)")
var saveSup: Bool = false
var saveSup = false

// MARK: - Entrypoint

mutating func run() throws {
// Setup utilities
let logger = Logger(subsystem: "github.ecdye.macSubtitleOCR", category: "main")
let manager = FileManager.default

// Setup options
let inFile = input
let revision = setOCRRevision()
let recognitionLevel = setOCRMode()
let languages = language.split(separator: ",").map { String($0) }

// Setup data variables
func run() throws {
let fileManager = FileManager.default
var subIndex = 1
var jsonStream: [Any] = []
var srtStreams: [Int: SRT] = [:]
var intermediateFiles: [Int: String] = [:]
var results: [macSubtitleOCRResult] = []

if input.hasSuffix(".mkv") {
let mkvStream = try MKVSubtitleExtractor(filePath: input)
Expand All @@ -71,62 +61,55 @@ struct macSubtitleOCR: ParsableCommand {
// Open the PGS data stream
let PGS = try PGS(URL(fileURLWithPath: intermediateFiles[track.trackNumber]!))

let srtStream = SRT()
srtStreams[track.trackNumber] = srtStream

try processSubtitles(PGS: PGS, srtStream: srtStream, trackNumber: track.trackNumber, subIndex: subIndex, jsonStream: &jsonStream, logger: logger, manager: manager, recognitionLevel: recognitionLevel, languages: languages, revision: revision)
let result = try processSubtitles(PGS: PGS, trackNumber: track.trackNumber, subIndex: subIndex)
results.append(result)
}
} else {
// Open the PGS data stream
let PGS = try PGS(URL(fileURLWithPath: input))
let srtStream = SRT()
srtStreams[0] = srtStream

try processSubtitles(PGS: PGS, srtStream: srtStream, trackNumber: 0, subIndex: subIndex, jsonStream: &jsonStream, logger: logger, manager: manager, recognitionLevel: recognitionLevel, languages: languages, revision: revision)
}

if let json {
// Convert subtitle data to JSON
let jsonData = try JSONSerialization.data(withJSONObject: jsonStream, options: [.prettyPrinted, .sortedKeys])
let jsonString = String(data: jsonData, encoding: .utf8) ?? "[]"

// Write JSON to file
try jsonString.write(to: URL(fileURLWithPath: json),
atomically: true,
encoding: .utf8)
let result = try processSubtitles(PGS: PGS, trackNumber: 0, subIndex: subIndex)
results.append(result)
}

for (trackNumber, srtStream) in srtStreams {
let outputDirectory = URL(fileURLWithPath: srtDirectory)
do {
try manager.createDirectory(at: outputDirectory, withIntermediateDirectories: false, attributes: nil)
} catch CocoaError.fileWriteFileExists {
// Folder already existed
let outputDirectory = URL(fileURLWithPath: outputDirectory)
try fileManager.createDirectory(at: outputDirectory, withIntermediateDirectories: true, attributes: nil)
for result in results {
// Save srt file
let srtFilePath = outputDirectory.appendingPathComponent("track_\(result.trackNumber).srt")
try result.srt.write(toFileAt: srtFilePath)

// Save json file
if json {
// Convert subtitle data to JSON
let jsonData = try JSONSerialization.data(withJSONObject: result.json, options: [.prettyPrinted, .sortedKeys])
let jsonString = (String(data: jsonData, encoding: .utf8) ?? "[]")
let jsonFilePath = outputDirectory.appendingPathComponent("track_\(result.trackNumber).json")
try jsonString.write(to: jsonFilePath, atomically: true, encoding: .utf8)
}
let srtFilePath = outputDirectory.appendingPathComponent("track_\(trackNumber).srt")
try srtStream.write(toFileAt: srtFilePath)
if saveSup, inFile.hasSuffix(".mkv") {
let fileName = "\(trackNumber)_" + URL(fileURLWithPath: inFile).deletingPathExtension().appendingPathExtension("sup").lastPathComponent
try manager.moveItem(
at: URL(fileURLWithPath: intermediateFiles[trackNumber]!),
to: URL(fileURLWithPath: inFile).deletingLastPathComponent().appendingPathComponent(fileName))
} else if inFile.hasSuffix(".mkv") {
try manager.removeItem(at: URL(fileURLWithPath: intermediateFiles[trackNumber]!))

// Save or remove intermediate files
if saveSup, input.hasSuffix(".mkv") {
let subtitleFilePath = outputDirectory.appendingPathComponent("track_\(result.trackNumber).sup")
try fileManager.moveItem(
at: URL(fileURLWithPath: intermediateFiles[result.trackNumber]!),
to: subtitleFilePath)
} else if input.hasSuffix(".mkv") {
try fileManager.removeItem(at: URL(fileURLWithPath: intermediateFiles[result.trackNumber]!))
}
}
}

// MARK: - Methods

private func setOCRMode() -> VNRequestTextRecognitionLevel {
private func getOCRMode() -> VNRequestTextRecognitionLevel {
if fastMode {
VNRequestTextRecognitionLevel.fast
} else {
VNRequestTextRecognitionLevel.accurate
}
}

private func setOCRRevision() -> Int {
private func getOCRRevision() -> Int {
if #available(macOS 13, *) {
VNRecognizeTextRequestRevision3
} else {
Expand All @@ -146,9 +129,11 @@ struct macSubtitleOCR: ParsableCommand {
}
}

private func processSubtitles(PGS: PGS, srtStream: SRT, trackNumber _: Int, subIndex: Int, jsonStream: inout [Any], logger: Logger, manager: FileManager, recognitionLevel: VNRequestTextRecognitionLevel, languages: [String], revision: Int) throws {
private func processSubtitles(PGS: PGS, trackNumber: Int, subIndex: Int) throws -> macSubtitleOCRResult {
var subIndex = subIndex
var inJson = jsonStream
var json: [Any] = []
let srt = SRT()

for subtitle in PGS.getSubtitles() {
if subtitle.imageWidth == 0, subtitle.imageHeight == 0 {
logger.debug("Skipping subtitle index \(subIndex) with empty image data!")
Expand All @@ -161,16 +146,11 @@ struct macSubtitleOCR: ParsableCommand {
continue
}

// Save subtitle image as PNG if imageDirectory is provided
if let imageDirectory {
let outputDirectory = URL(fileURLWithPath: imageDirectory)
do {
try manager.createDirectory(at: outputDirectory, withIntermediateDirectories: false, attributes: nil)
} catch CocoaError.fileWriteFileExists {
// Folder already existed
}
let pngPath = outputDirectory.appendingPathComponent("subtitle_\(subIndex).png")

// Save subtitle image as PNG if requested
if saveImages {
let imageDirectory = URL(fileURLWithPath: outputDirectory).appendingPathComponent("images/" + "track_\(trackNumber)/")
try FileManager.default.createDirectory(at: imageDirectory, withIntermediateDirectories: true, attributes: nil)
let pngPath = imageDirectory.appendingPathComponent("subtitle_\(subIndex).png")
try saveImageAsPNG(image: subImage, outputPath: pngPath)
}

Expand Down Expand Up @@ -213,23 +193,23 @@ struct macSubtitleOCR: ParsableCommand {
"text": subtitleText,
]

inJson.append(subtitleData)
json.append(subtitleData)

srtStream.appendSubtitle(SRTSubtitle(index: subIndex,
startTime: subtitle.timestamp,
endTime: subtitle.endTimestamp,
text: subtitleText))
srt.appendSubtitle(SRTSubtitle(index: subIndex,
startTime: subtitle.timestamp,
endTime: subtitle.endTimestamp,
text: subtitleText))
}

request.recognitionLevel = recognitionLevel
request.recognitionLevel = getOCRMode()
request.usesLanguageCorrection = languageCorrection
request.revision = revision
request.recognitionLanguages = languages
request.revision = getOCRRevision()
request.recognitionLanguages = language.split(separator: ",").map { String($0) }

try? VNImageRequestHandler(cgImage: subImage, options: [:]).perform([request])

subIndex += 1
}
jsonStream = inJson
return macSubtitleOCRResult(trackNumber: trackNumber, srt: srt, json: json)
}
}
13 changes: 13 additions & 0 deletions Sources/macSubtitleOCR/macSubtitleOCRResult.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
//
// macSubtitleOCRResult.swift
// macSubtitleOCR
//
// Created by Ethan Dye on 9/25/24.
// Copyright © 2024 Ethan Dye. All rights reserved.
//

/// The OCR output collected for a single subtitle track.
struct macSubtitleOCRResult {
/// Number of the subtitle track this result was produced from.
var trackNumber: Int
/// The SRT subtitles assembled for this track.
var srt: SRT
/// Raw per-subtitle OCR entries for this track.
/// NOTE(review): presumably must stay `JSONSerialization`-compatible, since callers serialize it as JSON — confirm.
var json: [Any]
}
Loading

0 comments on commit 8193f26

Please sign in to comment.