Calculate global RTF in example app
- Also updates UI for default stream mode to match eager mode
ZachNagengast committed Apr 1, 2024
1 parent e004b47 commit 5572cd6
Showing 3 changed files with 61 additions and 17 deletions.
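For context, the "global" (effective) RTF introduced by this commit divides the cumulative full-pipeline inference time by the total duration of audio processed so far, instead of reporting the per-run realTimeFactor from the transcription timings; the default stream path accumulates the pipeline time across transcription loops, while the eager path assigns it per run. A minimal sketch of that calculation, with hypothetical type and parameter names mirroring the ContentView.swift changes below (WhisperKit samples audio at 16 kHz):

import Foundation

// Hypothetical accumulator: effective RTF = total inference time / total audio processed.
struct EffectiveRTFTracker {
    private(set) var totalInferenceTime: TimeInterval = 0

    // Adds one transcription run and returns the updated effective RTF.
    // pipelineTime: full-pipeline time reported for this run, in seconds.
    // bufferSampleCount: number of audio samples processed so far.
    mutating func update(pipelineTime: TimeInterval,
                         bufferSampleCount: Int,
                         sampleRate: Int = 16_000) -> Double {
        totalInferenceTime += pipelineTime
        let totalAudioSeconds = Double(bufferSampleCount) / Double(sampleRate)
        // An effective RTF below 1.0 means transcription is keeping up with real time.
        return totalAudioSeconds > 0 ? totalInferenceTime / totalAudioSeconds : 0
    }
}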
4 changes: 4 additions & 0 deletions Examples/WhisperAX/WhisperAX.xcodeproj/project.pbxproj
@@ -85,6 +85,7 @@
161135DE2B3F66DA003C20F6 /* WhisperAX Watch App.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = "WhisperAX Watch App.app"; sourceTree = BUILT_PRODUCTS_DIR; };
161135F02B3F66DC003C20F6 /* WhisperAX Watch AppTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = "WhisperAX Watch AppTests.xctest"; sourceTree = BUILT_PRODUCTS_DIR; };
161135FA2B3F66DC003C20F6 /* WhisperAX Watch AppUITests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = "WhisperAX Watch AppUITests.xctest"; sourceTree = BUILT_PRODUCTS_DIR; };
1626683A2BB90CC9008F950A /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist; path = Info.plist; sourceTree = "<group>"; };
1677AFA62B57618A008C61C0 /* WhisperAX_Watch_AppUITests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = WhisperAX_Watch_AppUITests.swift; sourceTree = "<group>"; };
1677AFA72B57618A008C61C0 /* WhisperAX_Watch_AppUITestsLaunchTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = WhisperAX_Watch_AppUITestsLaunchTests.swift; sourceTree = "<group>"; };
1677AFA92B57618A008C61C0 /* WhisperAX.entitlements */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.entitlements; path = WhisperAX.entitlements; sourceTree = "<group>"; };
@@ -166,6 +167,7 @@
1677AFA82B57618A008C61C0 /* WhisperAX */ = {
isa = PBXGroup;
children = (
1626683A2BB90CC9008F950A /* Info.plist */,
1677AFAB2B57618A008C61C0 /* WhisperAXApp.swift */,
1677AFE42B5769E5008C61C0 /* Views */,
1677AFD72B576375008C61C0 /* Resources */,
@@ -874,6 +876,7 @@
ENABLE_HARDENED_RUNTIME = YES;
ENABLE_PREVIEWS = YES;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_FILE = WhisperAX/Info.plist;
INFOPLIST_KEY_LSApplicationCategoryType = "public.app-category.utilities";
INFOPLIST_KEY_NSMicrophoneUsageDescription = "Required to record audio from the microphone for transcription.";
"INFOPLIST_KEY_UIApplicationSceneManifest_Generation[sdk=iphoneos*]" = YES;
@@ -918,6 +921,7 @@
ENABLE_HARDENED_RUNTIME = YES;
ENABLE_PREVIEWS = YES;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_FILE = WhisperAX/Info.plist;
INFOPLIST_KEY_LSApplicationCategoryType = "public.app-category.utilities";
INFOPLIST_KEY_NSMicrophoneUsageDescription = "Required to record audio from the microphone for transcription.";
"INFOPLIST_KEY_UIApplicationSceneManifest_Generation[sdk=iphoneos*]" = YES;
11 changes: 11 additions & 0 deletions Examples/WhisperAX/WhisperAX/Info.plist
@@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>NSPrivacyAccessedAPITypes</key>
<dict>
<key>NSPrivacyAccessedAPIType</key>
<string>NSPrivacyAccessedAPICategoryUserDefaults</string>
</dict>
</dict>
</plist>
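The new Info.plist declares UserDefaults access in a privacy manifest entry (NSPrivacyAccessedAPICategoryUserDefaults). As an illustration only, assuming the example app persists settings through SwiftUI's @AppStorage (which reads and writes UserDefaults under the hood), this is the kind of access that declaration covers; the key name below is hypothetical:

import SwiftUI

// Illustrative only: @AppStorage-backed settings are UserDefaults access,
// which is what the privacy manifest entry above declares.
struct SettingsExample: View {
    @AppStorage("selectedModel") private var selectedModel: String = "base"

    var body: some View {
        Text("Model: \(selectedModel)")
    }
}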
63 changes: 46 additions & 17 deletions Examples/WhisperAX/WhisperAX/Views/ContentView.swift
@@ -57,10 +57,13 @@ struct ContentView: View {
@State private var isFilePickerPresented = false
@State private var firstTokenTime: TimeInterval = 0
@State private var pipelineStart: TimeInterval = 0
@State private var realTimeFactor: TimeInterval = 0
@State private var effectiveRealTimeFactor: TimeInterval = 0
@State private var totalInferenceTime: TimeInterval = 0
@State private var tokensPerSecond: TimeInterval = 0
@State private var currentLag: TimeInterval = 0
@State private var currentFallbacks: Int = 0
@State private var currentEncodingLoops: Int = 0
@State private var currentDecodingLoops: Int = 0
@State private var lastBufferSize: Int = 0
@State private var lastConfirmedSegmentEndSeconds: Float = 0
@State private var requiredSegmentsForConfirmation: Int = 4
@@ -110,10 +113,13 @@ struct ContentView: View {

firstTokenTime = 0
pipelineStart = 0
realTimeFactor = 0
effectiveRealTimeFactor = 0
totalInferenceTime = 0
tokensPerSecond = 0
currentLag = 0
currentFallbacks = 0
currentEncodingLoops = 0
currentDecodingLoops = 0
lastBufferSize = 0
lastConfirmedSegmentEndSeconds = 0
requiredSegmentsForConfirmation = 2
@@ -225,6 +231,7 @@ struct ContentView: View {
.foregroundColor(.secondary)
.multilineTextAlignment(.leading)
.frame(maxWidth: .infinity, alignment: .leading)
.padding(.top)
}
} else {
ForEach(Array(confirmedSegments.enumerated()), id: \.element) { _, segment in
@@ -240,20 +247,23 @@
let timestampText = enableTimestamps ? "[\(String(format: "%.2f", segment.start)) --> \(String(format: "%.2f", segment.end))]" : ""
Text(timestampText + segment.text)
.font(.headline)
.fontWeight(.light)
.fontWeight(.bold)
.foregroundColor(.gray)
.multilineTextAlignment(.leading)
.frame(maxWidth: .infinity, alignment: .leading)
}
if enableDecoderPreview {
Text("\(unconfirmedText.joined(separator: "\n"))")
.font(.caption)
.foregroundColor(.secondary)
.multilineTextAlignment(.leading)
.frame(maxWidth: .infinity, alignment: .leading)
Text("\(currentText)")
.font(.caption)
.foregroundColor(.secondary)
.multilineTextAlignment(.leading)
.frame(maxWidth: .infinity, alignment: .leading)
}
Text("\(unconfirmedText.joined(separator: "\n"))")
.font(.caption)
.foregroundColor(.secondary)
.multilineTextAlignment(.leading)
.frame(maxWidth: .infinity, alignment: .leading)
Text("\(currentText)")
.font(.caption)
.foregroundColor(.secondary)
.multilineTextAlignment(.leading)
.frame(maxWidth: .infinity, alignment: .leading)
}
}
}
@@ -552,6 +562,14 @@ struct ContentView: View {
.disabled(modelState != .loaded)
.frame(minWidth: 0, maxWidth: .infinity)

VStack {
Text("Encoder runs: \(currentEncodingLoops)")
.font(.caption)
Text("Decoder runs: \(currentDecodingLoops)")
.font(.caption)
}
.offset(x: -120, y: 0)

if isRecording {
Text("\(String(format: "%.1f", bufferSeconds)) s")
.font(.caption)
@@ -602,7 +620,7 @@ struct ContentView: View {
.padding(.top)

HStack {
Text(realTimeFactor.formatted(.number.precision(.fractionLength(3))) + " RTF")
Text(effectiveRealTimeFactor.formatted(.number.precision(.fractionLength(3))) + " RTF")
.font(.system(.body))
.lineLimit(1)
Spacer()
@@ -1104,7 +1122,8 @@ struct ContentView: View {
}

self.tokensPerSecond = transcription?.timings?.tokensPerSecond ?? 0
self.realTimeFactor = transcription?.timings?.realTimeFactor ?? 0
self.effectiveRealTimeFactor = transcription?.timings?.realTimeFactor ?? 0
self.currentEncodingLoops = Int(transcription?.timings?.totalEncodingRuns ?? 0)
self.firstTokenTime = transcription?.timings?.firstTokenTime ?? 0
self.pipelineStart = transcription?.timings?.pipelineStart ?? 0
self.currentLag = transcription?.timings?.decodingLoop ?? 0
@@ -1147,6 +1166,7 @@ struct ContentView: View {
}
self.currentText = progress.text
self.currentFallbacks = fallbacks
self.currentDecodingLoops += 1
}
// Check early stopping
let currentTokens = progress.tokens
@@ -1261,10 +1281,14 @@ struct ContentView: View {
let transcription = try await transcribeEagerMode(Array(currentBuffer))
await MainActor.run {
self.tokensPerSecond = transcription?.timings?.tokensPerSecond ?? 0
self.realTimeFactor = transcription?.timings?.realTimeFactor ?? 0
self.firstTokenTime = transcription?.timings?.firstTokenTime ?? 0
self.pipelineStart = transcription?.timings?.pipelineStart ?? 0
self.currentLag = transcription?.timings?.decodingLoop ?? 0
self.currentEncodingLoops = Int(transcription?.timings?.totalEncodingRuns ?? 0)

let totalAudio = Double(currentBuffer.count) / Double(WhisperKit.sampleRate)
self.totalInferenceTime = transcription?.timings?.fullPipeline ?? 0
self.effectiveRealTimeFactor = Double(totalInferenceTime) / totalAudio
}
} else {
// Run realtime transcribe using timestamp tokens directly
@@ -1279,10 +1303,14 @@
}

self.tokensPerSecond = transcription?.timings?.tokensPerSecond ?? 0
self.realTimeFactor = transcription?.timings?.realTimeFactor ?? 0
self.firstTokenTime = transcription?.timings?.firstTokenTime ?? 0
self.pipelineStart = transcription?.timings?.pipelineStart ?? 0
self.currentLag = transcription?.timings?.decodingLoop ?? 0
self.currentEncodingLoops += Int(transcription?.timings?.totalEncodingRuns ?? 0)

let totalAudio = Double(currentBuffer.count) / Double(WhisperKit.sampleRate)
self.totalInferenceTime += transcription?.timings?.fullPipeline ?? 0
self.effectiveRealTimeFactor = Double(totalInferenceTime) / totalAudio

// Logic for moving segments to confirmedSegments
if segments.count > requiredSegmentsForConfirmation {
@@ -1352,6 +1380,7 @@ struct ContentView: View {
}
self.currentText = progress.text
self.currentFallbacks = fallbacks
self.currentDecodingLoops += 1
}
// Check early stopping
let currentTokens = progress.tokens
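The ContentView changes above also track encoder and decoder work: encoder runs are taken from the transcription timings (totalEncodingRuns), while the decoding-loop count is incremented once per decoding progress callback. A minimal sketch of that counting pattern, with illustrative names rather than WhisperKit's exact API:

// Illustrative counters: each progress callback invocation corresponds to one
// decoder iteration, so incrementing per callback gives a running decoding-loop
// count, as the diff does with currentDecodingLoops.
final class InferenceCounters {
    private(set) var decodingLoops = 0
    private(set) var encodingLoops = 0

    // Call from a decoding-progress callback (hypothetical hook).
    func onDecodingProgress() {
        decodingLoops += 1
    }

    // Call once per completed transcription with its reported encoder runs.
    func addEncodingRuns(_ totalEncodingRuns: Int) {
        encodingLoops += totalEncodingRuns
    }
}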
