@@ -163,7 +163,6 @@ def action(file_uploader, transformations):
163163 st .balloons ()
164164
165165
166-
167166def recognize (file_path , audio ):
168167 ds = {}
169168 ds ["speech" ] = audio
@@ -172,12 +171,14 @@ def recognize(file_path, audio):
172171 input_values = processor (ds ["speech" ], return_tensors = "pt" , padding = "longest" ).input_values # Batch size 1
173172
174173 # retrieve logits
175- logits = model (input_values ).logits
174+ logits = model (input_values ).logits [ 0 ]
176175
177176 # take argmax and decode
178- predicted_ids = torch .argmax (logits , dim = - 1 )
179- transcription = processor .batch_decode (predicted_ids )
180- return transcription [0 ]
177+ # predicted_ids = torch.argmax(logits, dim=-1)
178+ # transcription = processor.batch_decode(predicted_ids)
179+ transcription = ngram_lm_model .decode (logits .cpu ().detach ().numpy (), beam_width = 500 )
180+
181+ return transcription
181182
182183
183184def main ():
@@ -188,7 +189,19 @@ def main():
188189 "Once you have chosen augmentation techniques, select or upload an audio file\n . "
189190 'Then click "Apply" to start! \n \n '
190191 )
192+
191193 if True :
194+ col1 , col2 , col3 = st .columns ([1 ,9 ,1 ])
195+
196+ with col1 :
197+ st .write ("" )
198+
199+ with col2 :
200+ st .image ("demo/assets/demoo.gif" )
201+
202+ with col3 :
203+ st .write ("" )
204+
192205 st .subheader ("Team members:" )
193206 members = '''
194207 Pham Hung Manh\n
@@ -197,7 +210,9 @@ def main():
197210 Nguyen Nhu Toan\n
198211 Ho Nguyen Khang\n '''
199212 st .markdown (members )
213+
200214 st .success ("Manh Ph" )
215+ # st.sidebar.image("demo/assets/demoo.gif")
201216 st .sidebar .markdown ("Choose the transformations here:" )
202217 gaussian_noise = st .sidebar .checkbox ("GaussianNoise" )
203218 frequency_mask = st .sidebar .checkbox ("FrequencyMask" )
0 commit comments