BowlerKernel
VoskLipSync.java
package com.neuronrobotics.bowlerstudio.lipsync;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import com.google.common.reflect.TypeToken;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.neuronrobotics.bowlerstudio.AudioStatus;
import com.neuronrobotics.bowlerstudio.IAudioProcessingLambda;
import com.neuronrobotics.bowlerstudio.scripting.ScriptingEngine;

import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.lang.reflect.Type;
import java.net.URL;

import javax.sound.sampled.AudioFileFormat;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.DataLine;
import javax.sound.sampled.LineUnavailableException;
import javax.sound.sampled.TargetDataLine;
import javax.sound.sampled.UnsupportedAudioFileException;

import org.vosk.Model;
import org.vosk.Recognizer;
import net.lingala.zip4j.ZipFile;

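/**
 * Generates lip-sync data by running the Vosk offline speech recognizer over a
 * TTS audio stream, mapping each recognized word to phonemes via a CMU
 * phonetic dictionary, and queueing time-coded {@link AudioStatus} visemes for
 * mouth animation during playback.
 *
 * A minimal usage sketch (assuming an AudioInputStream from an upstream TTS
 * engine; the variable names here are illustrative, not part of this class):
 *
 * <pre>{@code
 * VoskLipSync sync = VoskLipSync.get();
 * AudioInputStream ais = sync.startProcessing(ttsStream, "Hello world");
 * // ...during playback, poll for the current viseme:
 * AudioStatus viseme = sync.update(AudioStatus.X_NO_SOUND, 0, 0, 0, percentPlayed);
 * }</pre>
 */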
public class VoskLipSync implements IAudioProcessingLambda {
	private static double PercentageTimeOfLipSyncReadahead = 2;
	private static VoskLipSync singleton = null;

	private VoskLipSync() {

	}

	public static VoskLipSync get() {
		if (singleton == null) {
			singleton = new VoskLipSync();
		}
		return singleton;
	}

	class VoskResultWord {
		double conf;
		double end;
		double start;
		String word;

		@Override
		public String toString() {
			return "\n'" + word + "' \n\tstarts at " + start + " ends at " + end + " confidence " + conf;
		}
	}

	class VoskPartial {
		String partial;
		List<VoskResultWord> partial_result;
	}

	class VoskResult {
		String text;
		List<VoskResultWord> result;
	}

	Type partialType = new TypeToken<VoskPartial>() {
	}.getType();
	Type resultType = new TypeToken<VoskResult>() {
	}.getType();

	static Gson gson = new GsonBuilder().disableHtmlEscaping().setPrettyPrinting().create();
	private static Model model;
	private static String modelName;
	private static PhoneticDictionary dict;
	// 16-bit stereo signed PCM format used for the recognizer and microphone
	private static AudioFormat format = new AudioFormat(AudioFormat.Encoding.PCM_SIGNED, 60000, 16, 2, 4, 44100, false);

	static {
		setModelName("vosk-model-en-us-daanzu-20200905");
		loadDictionary();
	}

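	/**
	 * Loads the CMU phonetic dictionary (cmudict-0.7b) from git; it is used to
	 * map recognized words to phoneme sequences.
	 */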
	public static void loadDictionary() {
		try {
			File phoneticDatabaseFile = ScriptingEngine
					.fileFromGit("https://github.com/madhephaestus/TextToSpeechASDRTest.git", "cmudict-0.7b.txt");
			dict = new PhoneticDictionary(phoneticDatabaseFile);
		} catch (Throwable t) {
			t.printStackTrace();
		}
	}

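	/**
	 * @return the name of the Vosk model in use
	 */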
	public static String getModelName() {
		return modelName;
	}

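	/**
	 * Sets the Vosk model by name. If the model is not already present in the
	 * workspace it is downloaded from https://alphacephei.com/vosk/models/ and
	 * unzipped, then loaded.
	 *
	 * @param modelName the name of the Vosk model, e.g.
	 *                  "vosk-model-en-us-daanzu-20200905"
	 */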
	public static void setModelName(String modelName) {
		VoskLipSync.modelName = modelName;
		String pathToModel = ScriptingEngine.getWorkspace().getAbsolutePath() + "/" + getModelName() + ".zip";
		File zipfile = new File(pathToModel);
		try {
			if (!zipfile.exists()) {

				String urlStr = "https://alphacephei.com/vosk/models/" + getModelName() + ".zip";
				URL url = new URL(urlStr);
				BufferedInputStream bis = new BufferedInputStream(url.openStream());
				FileOutputStream fos = new FileOutputStream(zipfile);
				byte[] buffer = new byte[1024];
				int count = 0;
				System.out.println("Downloading Vosk Model " + getModelName());
				while ((count = bis.read(buffer, 0, 1024)) != -1) {
					fos.write(buffer, 0, count);
					System.out.print(".");
				}
				fos.close();
				bis.close();

				String source = zipfile.getAbsolutePath();
				String destination = ScriptingEngine.getWorkspace().getAbsolutePath();
				System.out.println("Unzipping Vosk Model " + getModelName());
				ZipFile zipFile = new ZipFile(source);
				zipFile.extractAll(destination);
			}
			model = new Model(ScriptingEngine.getWorkspace().getAbsolutePath() + "/" + getModelName() + "/");
		} catch (Throwable t) {
			t.printStackTrace();
			model = null;
		}
	}

	int numBytesRead = 0;
	int CHUNK_SIZE = 4096;
	byte[] abData = new byte[CHUNK_SIZE];
	// visemes queued for playback; consumed by update()
	ArrayList<TimeCodedViseme> timeCodedVisemes = null;
	// full list of visemes for the current track, written out as JSON
	ArrayList<TimeCodedViseme> timeCodedVisemesCache = new ArrayList<TimeCodedViseme>();
	int words = 0;
	private double positionInTrack;
	private double timeLeadLag = 0.5;

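	/**
	 * Maps a phoneme to its viseme, defaulting to a closed mouth (X_NO_SOUND)
	 * for unknown phonemes.
	 */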
	private AudioStatus toStatus(String phoneme) {
		AudioStatus s = AudioStatus.getFromPhoneme(phoneme);
		if (s != null)
			return s;
		// println "Unknown phoneme "+phoneme
		return AudioStatus.X_NO_SOUND;
	}

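	/**
	 * Converts one recognized word into a sequence of time-coded visemes,
	 * splitting the word's duration evenly across its phonemes and inserting
	 * transitional visemes for silences and for mouth shapes that should not
	 * follow each other directly.
	 *
	 * @param word the word with start/end times from the recognizer
	 * @param len  the length of the whole track in milliseconds
	 */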
	private void addWord(VoskResultWord word, long len) {

		double secLen = ((double) len) / 1000.0;
		String w = word.word;
		if (w == null)
			return;

		double wordStart = word.start;
		double wordEnd = word.end;
		double wordLen = wordEnd - wordStart;
		ArrayList<String> phonemes = dict.find(w);
		// println w + ", " + wordStart + ", " + phonemes;
		if (phonemes == null) {
			// println "\n\n unknown word "+w+"\n\n"
			return;
		}

		double phonemeLength = wordLen / phonemes.size();

		// NOTE: this local shadows the timeLeadLag field; a small fraction of a
		// 24 fps frame
		double timeLeadLag = -(1 / 24.0 / 2048);

		// This is where to adjust the lead/lag of the lip sync relative to the
		// audio playback. It is also where viseme sequencing can be tuned and
		// transition frames added.
		for (int i = 0; i < phonemes.size(); i++) {
			String phoneme = phonemes.get(i);
			AudioStatus stat = toStatus(phoneme);

			// shorten the lead/lag for the H_L_SOUNDS viseme
			if (stat == AudioStatus.H_L_SOUNDS) {
				timeLeadLag = -(1 / 24.0 / 128);
			}

			double myStart = Math.max(wordStart + phonemeLength * ((double) i) + timeLeadLag, 0);
			double myEnd = wordStart + phonemeLength * ((double) (i + 1)) + timeLeadLag;
			double segLen = myEnd - myStart;
			TimeCodedViseme tc = new TimeCodedViseme(stat, myStart, myEnd, secLen);

			// adds a transitional silent viseme when a silence is detected
			if (timeCodedVisemes.size() > 0) {
				TimeCodedViseme tcLast = timeCodedVisemes.get(timeCodedVisemes.size() - 1);
				if (myStart - tcLast.end > 0.03) {

					// for longer pauses, transition through a partially open mouth to closed
					double siLength = myStart - tcLast.end;
					double hLength = siLength / 3.0;
					double mouthClosedTime = myStart - hLength;

					TimeCodedViseme tcSilentH = new TimeCodedViseme(AudioStatus.H_L_SOUNDS, tcLast.end, mouthClosedTime,
							secLen);
					TimeCodedViseme tcSilentX = new TimeCodedViseme(AudioStatus.X_NO_SOUND, mouthClosedTime, myStart,
							secLen);

					add(tcSilentH);
					add(tcSilentX);
				} else if (myStart - tcLast.end > 0) {
					// short transition to a partially open mouth
					TimeCodedViseme tcSilent = new TimeCodedViseme(AudioStatus.H_L_SOUNDS, tcLast.end, myStart, secLen);
					add(tcSilent);
				}
			}

			// looks for transition situations between visemes within a word (i.e. it bails
			// at the last syllable)
			if (i < phonemes.size() - 1) {
				String next_phoneme = phonemes.get(i + 1);
				AudioStatus stat_next = toStatus(next_phoneme);
				// identifies transition situations
				// ⒶⒸⒹ and ⒷⒸⒹ
				// ⒸⒺⒻ and ⒹⒺⒻ
				if (
				// A or B preceding D
				(stat_next == AudioStatus.D_AA_SOUNDS
						&& (stat == AudioStatus.A_PBM_SOUNDS || stat == AudioStatus.B_KST_SOUNDS)) ||
				// D preceding A or B
						((stat_next == AudioStatus.A_PBM_SOUNDS || stat_next == AudioStatus.B_KST_SOUNDS)
								&& stat == AudioStatus.D_AA_SOUNDS)
						||
						// C or D preceding an F
						(stat_next == AudioStatus.F_UW_OW_W_SOUNDS
								&& (stat == AudioStatus.C_EH_AE_SOUNDS || stat == AudioStatus.D_AA_SOUNDS))
						||
						// F preceding a C or D
						((stat_next == AudioStatus.C_EH_AE_SOUNDS || stat_next == AudioStatus.D_AA_SOUNDS)
								&& stat == AudioStatus.F_UW_OW_W_SOUNDS)) {
					// println "transition situation detected";

					// determine the current length of the viseme, and the length and start point of
					// the transition to be applied
					double visLength = tc.end - tc.start;
					double transLength = visLength / 3.0;
					double transStart = tc.end - transLength;

					AudioStatus transViseme = tc.status;

					// based on the situation, set the appropriate transition viseme
					if (stat_next == AudioStatus.F_UW_OW_W_SOUNDS || stat == AudioStatus.F_UW_OW_W_SOUNDS) {
						// C or D found preceding an F, or F found preceding a C or D
						transViseme = AudioStatus.E_AO_ER_SOUNDS;
					} else if (stat_next == AudioStatus.D_AA_SOUNDS || stat == AudioStatus.D_AA_SOUNDS) {
						// A or B found preceding a D, or D found preceding an A or B
						transViseme = AudioStatus.C_EH_AE_SOUNDS;
					} else {
						// println "ERR_TRANSITION"
					}

					// create the transition viseme
					TimeCodedViseme tc_transition = new TimeCodedViseme(transViseme, transStart, tc.end, secLen);

					// push back the end point of the main viseme to the start point of the
					// transition viseme
					tc.end = transStart;

					// add the modified original viseme, and then the transition viseme
					add(tc);
					add(tc_transition);
				} else {
					// handles situations within words where the following viseme does not
					// require a transition
					add(tc);
				}
			} else {
				// handles situations at the end of words
				add(tc);
			}
		}

		// println "Word "+w+" starts at "+wordStart+" ends at "+wordEnd+" each phoneme
		// length "+phonemeLength+" "+phonemes+" "+timeCodedVisemes

	}

	private void add(TimeCodedViseme v) {
		// println "Adding "+ v
		timeCodedVisemes.add(v);
		timeCodedVisemesCache.add(v);
	}

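	/**
	 * Feeds any not-yet-consumed words from a recognizer result into
	 * {@link #addWord(VoskResultWord, long)}.
	 */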
	private void processWords(List<VoskResultWord> wordList, long len) {
		if (wordList == null)
			return;

		// partial results repeat earlier words; 'words' tracks how many have
		// already been consumed so each word is only added once
		for (; words < wordList.size(); words++) {
			VoskResultWord word = wordList.get(words);
			addWord(word, len);
		}
	}

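	/**
	 * Runs Vosk speech recognition over the given audio file on a background
	 * thread, queueing time-coded visemes as words are recognized, and writing
	 * the full viseme list to tmp-tts-visime.json in the workspace. Blocks until
	 * the readahead percentage of the track has been transcribed (or the track's
	 * duration has elapsed), then interrupts the background recognizer if it is
	 * still running.
	 *
	 * @param f           the audio file to transcribe
	 * @param ttsLocation the path to the TTS text file (currently unused)
	 */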
	public void processRaw(File f, String ttsLocation) throws UnsupportedAudioFileException, IOException {

		words = 0;
		positionInTrack = 0;
		AudioInputStream getAudioInputStream = AudioSystem.getAudioInputStream(f);
		long durationInMillis = (long) (1000 * getAudioInputStream.getFrameLength()
				/ getAudioInputStream.getFormat().getFrameRate());
		long start = System.currentTimeMillis();
		timeCodedVisemesCache.clear();
		Thread t = new Thread(() -> {
			try {

				double secLen = ((double) durationInMillis) / 1000.0;
				AudioInputStream ais = AudioSystem.getAudioInputStream(format, getAudioInputStream);
				Recognizer recognizer = new Recognizer(model, 120000);
				recognizer.setWords(true);
				recognizer.setPartialWords(true);
				numBytesRead = 0;
				long total = 0;
				while (!Thread.interrupted()) {
					numBytesRead = ais.read(abData, 0, abData.length);
					if (numBytesRead == -1)
						break;
					total += numBytesRead;
					double tmpTotal = total;
					double len = (ais.getFrameLength() * 2);
					positionInTrack = tmpTotal / len * 100.0;

					if (recognizer.acceptWaveForm(abData, numBytesRead)) {
						String result = recognizer.getResult();
						VoskResult database = gson.fromJson(result, resultType);
						processWords(database.result, durationInMillis);
					} else {
						String result = recognizer.getPartialResult();
						VoskPartial database = gson.fromJson(result, partialType);
						processWords(database.partial_result, durationInMillis);
					}
				}
				VoskResult database = gson.fromJson(recognizer.getFinalResult(), resultType);
				recognizer.close();
				processWords(database.result, durationInMillis);
				positionInTrack = 100;
				if (timeCodedVisemes.size() > 0) {
					TimeCodedViseme tcLast = timeCodedVisemes.get(timeCodedVisemes.size() - 1);
					// terminating viseme of silence out to the end of the track
					TimeCodedViseme tc = new TimeCodedViseme(AudioStatus.X_NO_SOUND, tcLast.end, secLen, secLen);
					add(tc);
				}
				File json = new File(ScriptingEngine.getWorkspace().getAbsolutePath() + "/tmp-tts-visime.json");
				if (!json.exists()) {
					json.createNewFile();
				}
				String s = gson.toJson(timeCodedVisemesCache);
				BufferedWriter writer = new BufferedWriter(new FileWriter(json.getAbsolutePath()));
				writer.write(s);
				writer.close();
				System.out.println("Lip Sync data written to " + json.getAbsolutePath());
				timeCodedVisemesCache.clear();
			} catch (Throwable tr) {
				tr.printStackTrace();
			}
		});
		t.start();

		// block until enough of the track has been transcribed to start playback
		while (t.isAlive() && positionInTrack < getPercentageTimeOfLipSyncReadahead()
				&& (System.currentTimeMillis() - start < durationInMillis)) {
			try {
				Thread.sleep(1);
			} catch (InterruptedException e) {
				break;
			}
		}
		if (t.isAlive()) {
			t.interrupt();
		}
		// println "Visemes added, start audio.. "
	}

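	/**
	 * Writes the incoming TTS audio and text to temporary files in the
	 * workspace, runs {@link #processRaw(File, String)} on them, and returns a
	 * fresh stream of the same audio for playback.
	 */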
	public AudioInputStream startProcessing(AudioInputStream ais, String TTSString) {
		timeCodedVisemes = new ArrayList<>();

		File audio = new File(ScriptingEngine.getWorkspace().getAbsolutePath() + "/tmp-tts.wav");
		try {
			long start = System.currentTimeMillis();
			System.out.println("Vosk Lip Sync Begin writing..");
			AudioSystem.write(ais, AudioFileFormat.Type.WAVE, audio);
			ais = AudioSystem.getAudioInputStream(audio);
			File text = new File(ScriptingEngine.getWorkspace().getAbsolutePath() + "/tmp-tts.txt");
			if (!text.exists())
				text.createNewFile();
			try {
				FileWriter myWriter = new FileWriter(text);
				myWriter.write(TTSString);
				myWriter.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
			// rhubarb!
			processRaw(audio, text.getAbsolutePath());
			System.out.println("Vosk Lip Sync Done writing! took " + (System.currentTimeMillis() - start) + " ms");
		} catch (Exception e) {
			e.printStackTrace();
		}

		return ais;
	}

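	/**
	 * Called during playback to select the current viseme: pops visemes off the
	 * queue as the playback percentage passes their end points.
	 *
	 * @param percent how far through the track playback currently is, 0-100
	 * @return the viseme to display now
	 */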
	public AudioStatus update(AudioStatus current, double amplitudeUnitVector, double currentRollingAverage,
			double currentDerivativeTerm, double percent) {
		// println timeCodedVisemes
		AudioStatus ret = null;
		if (timeCodedVisemes.size() > 0) {
			TimeCodedViseme map = timeCodedVisemes.get(0);
			AudioStatus key = map.status;
			double value = map.getEndPercentage();
			if (percent > value) {
				// this viseme is finished; advance to the next one
				timeCodedVisemes.remove(0);
				if (timeCodedVisemes.size() > 0)
					ret = timeCodedVisemes.get(0).status;
				else {
					// println "\n\nERROR Audio got ahead of lip sync "+percent+"\n\n"
					ret = AudioStatus.X_NO_SOUND;
				}
			} else if (percent > map.getStartPercentage())
				ret = key;
		} else {
			// println "\n\nERROR Audio got ahead of lip sync "+percent+"\n\n"
		}
		if (ret == null)
			ret = current;
		if (current != ret) {
			// println ret.toString()+" starting at "+percent
		}
		return ret;
	}

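	/**
	 * Performs speech-to-text from the default microphone, listening for up to
	 * 30 seconds or until a phrase is recognized.
	 *
	 * @return the recognized text, or null if nothing was recognized
	 */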
	public static String promptFromMicrophone() throws IOException, LineUnavailableException {
		if (model == null)
			throw new RuntimeException("Vosk Model failed to load, check "
					+ ScriptingEngine.getWorkspace().getAbsolutePath() + "/" + getModelName());
		Recognizer recognizer = new Recognizer(model, 120000);

		DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
		TargetDataLine microphone;
		microphone = (TargetDataLine) AudioSystem.getLine(info);
		microphone.open(format);
		microphone.start();

		int numBytesRead;
		int CHUNK_SIZE = 1024;
		int bytesRead = 0;

		byte[] b = new byte[4096];
		// println "Listening..."
		String result = null;
		long start = System.currentTimeMillis();
		Type STTType = new TypeToken<HashMap<String, String>>() {
		}.getType();
		try {
			// listen for up to 30 seconds, or until a phrase is recognized
			while (((System.currentTimeMillis() - start) < 30000) && !Thread.interrupted()) {
				numBytesRead = microphone.read(b, 0, CHUNK_SIZE);
				bytesRead += numBytesRead;

				if (recognizer.acceptWaveForm(b, numBytesRead)) {
					result = recognizer.getResult();
					HashMap<String, String> db = gson.fromJson(result, STTType);
					result = db.get("text");
					if (result.length() > 2)
						break;
					else {
						// println "Listening..."
					}
				} else {
					// System.out.println(recognizer.getPartialResult());
				}
			}
		} catch (Throwable t) {
			t.printStackTrace();
		}
		recognizer.close();
		// System.out.println(result);
		microphone.close();
		return result;
	}

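	/**
	 * @return the percentage of the track that must be transcribed before
	 *         playback is allowed to start
	 */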
	public static double getPercentageTimeOfLipSyncReadahead() {
		return PercentageTimeOfLipSyncReadahead;
	}

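	/**
	 * @param percentageTimeOfLipSyncReadahead the percentage of the track that
	 *                                         must be transcribed before playback
	 *                                         is allowed to start
	 */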
	public static void setPercentageTimeOfLipSyncReadahead(double percentageTimeOfLipSyncReadahead) {
		PercentageTimeOfLipSyncReadahead = percentageTimeOfLipSyncReadahead;
	}

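	/**
	 * @return the lead/lag offset, in seconds, applied between the lip sync and
	 *         the audio playback
	 */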
	public double getTimeLeadLag() {
		return timeLeadLag;
	}

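	/**
	 * @param timeLeadLag the lead/lag offset, in seconds, applied between the
	 *                    lip sync and the audio playback
	 */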
	public void setTimeLeadLag(double timeLeadLag) {
		this.timeLeadLag = timeLeadLag;
	}

}