package com.neuronrobotics.bowlerstudio.lipsync;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import com.google.common.reflect.TypeToken;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.neuronrobotics.bowlerstudio.AudioStatus;
import com.neuronrobotics.bowlerstudio.IAudioProcessingLambda;
import com.neuronrobotics.bowlerstudio.scripting.ScriptingEngine;

import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.lang.reflect.Type;
import java.net.URL;

import javax.sound.sampled.AudioFileFormat;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.DataLine;
import javax.sound.sampled.LineUnavailableException;
import javax.sound.sampled.TargetDataLine;
import javax.sound.sampled.UnsupportedAudioFileException;

import org.vosk.Model;
import org.vosk.Recognizer;
import net.lingala.zip4j.ZipFile;
// One recognized word from the Vosk JSON output: start/end times in seconds plus a confidence score.
class VoskResultWord {
	double conf;
	double end;
	double start;
	String word;

	public String toString() {
		return "\n'" + word + "' \n\tstarts at " + start + " ends at " + end + " confidence " + conf;
	}
}
// Wrapper types matching the JSON documents Vosk emits for partial and final results.
class VoskPartial {
	List<VoskResultWord> partial_result;
}

class VoskResultl {
	List<VoskResultWord> result;
}

public class VoskLipSync implements IAudioProcessingLambda {
	Type partailType = new TypeToken<VoskPartial>() {
	}.getType();
	Type resultType = new TypeToken<VoskResultl>() {
	}.getType();
	static Gson gson = new GsonBuilder().disableHtmlEscaping().setPrettyPrinting().create();
	// Signed PCM, 16-bit, stereo, 4-byte frames, little-endian.
	private static AudioFormat format = new AudioFormat(AudioFormat.Encoding.PCM_SIGNED, 60000, 16, 2, 4, 44100,
			false);
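	/*
	 * For reference, the recognizer's JSON payloads are roughly of this shape (field names match
	 * the wrapper classes above; the values here are invented):
	 *
	 *   partial: { "partial_result": [ { "conf": 1.0, "start": 0.33, "end": 0.51, "word": "hello" } ], "partial": "hello" }
	 *   final:   { "result":         [ { "conf": 1.0, "start": 0.33, "end": 0.51, "word": "hello" } ], "text": "hello" }
	 */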
			// Fetch the CMU pronouncing dictionary (used to map words to phonemes) from Git.
			ScriptingEngine.fileFromGit(
					"https://github.com/madhephaestus/TextToSpeechASDRTest.git",
					"cmudict-0.7b.txt");
		} catch (Throwable t) {
		File zipfile = new File(pathTOModel);
		if (!zipfile.exists()) {
			// First run: download the Vosk model archive into the local cache.
			String urlStr = "https://alphacephei.com/vosk/models/" + getModelName() + ".zip";
			URL url = new URL(urlStr);
			BufferedInputStream bis = new BufferedInputStream(url.openStream());
			FileOutputStream fis = new FileOutputStream(zipfile);
			byte[] buffer = new byte[1024];
			int count;
			System.out.println("Downloading Vosk Model " + getModelName());
			while ((count = bis.read(buffer, 0, 1024)) != -1) {
				fis.write(buffer, 0, count);
				System.out.print(".");
			}
			fis.close();
			bis.close();
			// Unpack the archive so the Recognizer can load the model directory.
			String source = zipfile.getAbsolutePath();
			System.out.println("Unzipping Vosk Model " + getModelName());
			ZipFile zipFile = new ZipFile(source);
			zipFile.extractAll(destination);
		}
		} catch (Throwable t) {
	// Streaming state: the chunk buffer handed to the recognizer and the viseme timelines it fills.
	int numBytesRead = 0;
	int CHUNK_SIZE = 4096;
	byte[] abData = new byte[CHUNK_SIZE];
	ArrayList<TimeCodedViseme> timeCodedVisemes = null;
	ArrayList<TimeCodedViseme> timeCodedVisemesCache = new ArrayList<TimeCodedViseme>();
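	/*
	 * TimeCodedViseme is defined elsewhere in this package; as used in this file it is assumed to
	 * carry an AudioStatus `status`, `start`/`end` times in seconds, and getStartPercentage() /
	 * getEndPercentage() accessors expressing that window as a fraction of the whole clip.
	 */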
	// Turn one recognized word into time-coded visemes: look up its phonemes in the dictionary
	// and spread them evenly across the word's start..end window.
	private void addWord(VoskResultWord word, long len) {
		double secLen = ((double) len) / 1000.0;
		String w = word.word;
		double wordStart = word.start;
		double wordEnd = word.end;
		double wordLen = wordEnd - wordStart;
		ArrayList<String> phonemes = dict.find(w);
		if (phonemes == null) {
			return;
		}
		double phonemeLength = wordLen / phonemes.size();
		for (int i = 0; i < phonemes.size(); i++) {
			String phoneme = phonemes.get(i);
			double myStart = Math.max(wordStart + phonemeLength * ((double) i) + timeLeadLag, 0);
			double myEnd = wordStart + phonemeLength * ((double) (i + 1)) + timeLeadLag;
			double segLen = myEnd - myStart;
			if (timeCodedVisemes.size() > 0) {
				TimeCodedViseme tcLast = timeCodedVisemes.get(timeCodedVisemes.size() - 1);
				if (myStart - tcLast.end > 0.03) {
					// More than 30 ms of silence since the last viseme: close the mouth during the
					// gap, easing shut over the last third of the silence.
					double siLength = myStart - tcLast.end;
					double hLength = siLength / 3.0;
					double mouthClosedTime = myStart - hLength;
				} else if (myStart - tcLast.end > 0) {
			if (i < phonemes.size() - 1) {
				String next_phoneme = phonemes.get(i + 1);
				// Blend toward the next phoneme over the final third of this viseme's duration.
				double visLength = tc.end - tc.start;
				double transLength = visLength / 3.0;
				double transStart = tc.end - transLength;
			timeCodedVisemes.add(v);
			timeCodedVisemesCache.add(v);
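		/*
		 * Timing example (assuming timeLeadLag is 0): if Vosk reports "hello" from 0.50 s to 0.90 s
		 * and the dictionary returns four phonemes (e.g. HH AH L OW), each phoneme gets 0.10 s, so
		 * the third phoneme ("L", i = 2) spans myStart = 0.50 + 0.10 * 2 = 0.70 s to myEnd = 0.80 s.
		 */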
	// Feed newly recognized words to addWord(); the `words` counter persists across calls so
	// words already handled in an earlier partial result are not emitted twice.
	void processWords(List<VoskResultWord> wordList, long len) {
		if (wordList == null)
			return;
		for (; words < wordList.size(); words++) {
			VoskResultWord word = wordList.get(words);
	// Run Vosk over a prerecorded audio file on a worker thread, turning recognized words into
	// time-coded visemes and saving the finished timeline as JSON.
	public void processRaw(File f, String ttsLocation) throws UnsupportedAudioFileException, IOException {
		AudioInputStream getAudioInputStream = AudioSystem.getAudioInputStream(f);
		long durationInMillis = (long) (1000 * getAudioInputStream.getFrameLength()
				/ getAudioInputStream.getFormat().getFrameRate());
		long start = System.currentTimeMillis();
		timeCodedVisemesCache.clear();
		Thread t = new Thread(() -> {
			try {
				double secLen = ((double) durationInMillis) / 1000.0;
				AudioInputStream ais = AudioSystem.getAudioInputStream(format, getAudioInputStream);
				Recognizer recognizer = new Recognizer(model, 120000);
				recognizer.setWords(true);
				recognizer.setPartialWords(true);
				int total = 0;
				while ((numBytesRead != -1) && (!Thread.interrupted())) {
					numBytesRead = ais.read(abData, 0, abData.length);
					total += numBytesRead;
					double tmpTotal = total;
					double len = (ais.getFrameLength() * 2);
					if (recognizer.acceptWaveForm(abData, numBytesRead)) {
						// A finalized utterance: process its word list.
						String result = recognizer.getResult();
						VoskResultl database = gson.fromJson(result, resultType);
						processWords(database.result, durationInMillis);
					} else {
						// Otherwise use the running partial hypothesis so visemes stay ahead of playback.
						String result = recognizer.getPartialResult();
						VoskPartial database = gson.fromJson(result, partailType);
						processWords(database.partial_result, durationInMillis);
					}
				}
				// Flush whatever the recognizer is still holding once the stream is exhausted.
				VoskResultl database = gson.fromJson(recognizer.getFinalResult(), resultType);
				processWords(database.result, durationInMillis);
				if (timeCodedVisemes.size() > 0) {
					TimeCodedViseme tcLast = timeCodedVisemes.get(timeCodedVisemes.size() - 1);
				}
				// Persist the viseme timeline as pretty-printed JSON.
				if (!json.exists()) {
					json.createNewFile();
				}
				String s = gson.toJson(timeCodedVisemesCache);
				BufferedWriter writer = new BufferedWriter(new FileWriter(json.getAbsolutePath()));
				writer.write(s);
				writer.close();
				System.out.println("Lip Sync data written to " + json.getAbsolutePath());
				timeCodedVisemesCache.clear();
			} catch (Throwable tr) {
				tr.printStackTrace();
			}
		});
		t.start();
		// Block the calling thread while the recognizer works, for at most the clip's duration.
				&& (System.currentTimeMillis() - start < durationInMillis)) {
			} catch (InterruptedException e) {
	// Reset the viseme timeline, then cache the synthesized speech to disk as a WAV file along
	// with its prompt text so recognition can be run over the stored audio.
	public AudioInputStream startProcessing(AudioInputStream ais, String TTSString) {
		timeCodedVisemes = new ArrayList<>();
		long start = System.currentTimeMillis();
		System.out.println("Vosk Lip Sync Begin writing..");
		AudioSystem.write(ais, AudioFileFormat.Type.WAVE, audio);
		ais = AudioSystem.getAudioInputStream(audio);
		text.createNewFile();
		FileWriter myWriter = new FileWriter(text);
		myWriter.write(TTSString);
		myWriter.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
		System.out.println("Vosk Lip Sync Done writing! took " + (System.currentTimeMillis() - start));
		} catch (Exception e) {
	// Playback callback: `percent` is how far playback has progressed through the clip. Pop
	// visemes whose window has passed so the head of the queue matches the current time.
	public AudioStatus update(AudioStatus current, double amplitudeUnitVector, double currentRollingAverage,
			double currentDerivitiveTerm, double percent) {
		if (timeCodedVisemes.size() > 0) {
			double value = map.getEndPercentage();
			if (percent > value) {
				timeCodedVisemes.remove(0);
				if (timeCodedVisemes.size() > 0)
					ret = timeCodedVisemes.get(0).status;
			} else if (percent > map.getStartPercentage())
		if (current != ret) {
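			// Worked example, assuming getStartPercentage()/getEndPercentage() are the viseme's
			// start and end times divided by the clip length: in a 3.0 s clip, a viseme spanning
			// 1.2 s to 1.5 s covers percent 0.40 to 0.50, so it is removed once `percent` passes
			// 0.50 and the next viseme's status becomes the returned value.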
	// Listen on the default microphone for up to 30 seconds and return the first phrase the
	// recognizer hears.
	public static String promptFromMicrophone() {
		if (model == null)
			throw new RuntimeException("Vosk Model failed to load, check "
		Recognizer recognizer = new Recognizer(model, 120000);
		DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
		TargetDataLine microphone;
		microphone = (TargetDataLine) AudioSystem.getLine(info);
		microphone.open(format);
		microphone.start();
		int CHUNK_SIZE = 1024;
		int numBytesRead = 0;
		int bytesRead = 0;
		byte[] b = new byte[4096];
		String result = null;
		long start = System.currentTimeMillis();
		Type STTType = new TypeToken<HashMap<String, String>>() {
		}.getType();
		while (((System.currentTimeMillis() - start) < 30000) && !Thread.interrupted()) {
			numBytesRead = microphone.read(b, 0, CHUNK_SIZE);
			bytesRead += numBytesRead;
			if (recognizer.acceptWaveForm(b, numBytesRead)) {
				result = recognizer.getResult();
				HashMap<String, String> db = gson.fromJson(result, STTType);
				// Vosk's result JSON carries the recognized phrase under the "text" key.
				result = db.get("text");
				if (result.length() > 2)
					break;
			}
		}
		} catch (Throwable t) {
/*
 * Member summary (fields and methods referenced above, across this class and its helpers):
 *
 *   ArrayList<String> find(String w)
 *   static void loadDictionary()
 *   static double PercentageTimeOfLipSyncReadahead
 *   AudioStatus toStatus(String phoneme)
 *   AudioStatus update(AudioStatus current, double amplitudeUnitVector, double currentRollingAverage, double currentDerivitiveTerm, double percent)
 *   void processRaw(File f, String ttsLocation)
 *   static void setPercentageTimeOfLipSyncReadahead(double percentageTimeOfLipSyncReadahead)
 *   static String getModelName()
 *   static double getPercentageTimeOfLipSyncReadahead()
 *   void setTimeLeadLag(double timeLeadLag)
 *   static String promptFromMicrophone()
 *   void add(TimeCodedViseme v)
 *   void processWords(List<VoskResultWord> wordList, long len)
 *   static PhoneticDictionary dict
 *   AudioInputStream startProcessing(AudioInputStream ais, String TTSString)
 *   static AudioFormat format
 *   static void setModelName(String modelName)
 *   static VoskLipSync singelton
 *   void addWord(VoskResultWord word, long len)
 *   static File fileFromGit(String remoteURI, String fileInRepo)
 *   static File getWorkspace()
 *   static AudioStatus get(char code)
 *   static AudioStatus getFromPhoneme(String code)
 */
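/*
 * Usage sketch (not part of this file): one way to exercise the members listed above. The model
 * name, the public visibility of these members, and driving them from a plain main() are
 * assumptions, not something this source confirms.
 */
class VoskLipSyncExample {
	public static void main(String[] args) throws Exception {
		// Choose which Vosk model to fetch from https://alphacephei.com/vosk/models (assumed name).
		VoskLipSync.setModelName("vosk-model-small-en-us-0.15");
		// Blocks for up to 30 seconds while listening on the default microphone.
		String heard = VoskLipSync.promptFromMicrophone();
		System.out.println("Recognized: " + heard);
	}
}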