{ "cells": [
 { "cell_type": "markdown", "metadata": {}, "source": [ "*this notebook requires a working PyTorch GPU environment* " ] },
 { "cell_type": "markdown", "metadata": { "id": "v5hvo8QWN-a9" }, "source": [ "# OpenAI's Whisper\n", "\n", "...original notebook from https://github.com/fastforwardlabs/whisper-openai/blob/master/WhisperDemo.ipynb\n", "\n", "Speech to text...\n", "\n", "more information at\n", "- https://openai.com/blog/whisper\n", "- https://github.com/openai/whisper\n", "\n", "\n" ] },
 { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
  "%%capture\n",
  "# install dependencies\n",
  "# (%pip rather than !pip, so the package lands in the environment this kernel is running in)\n",
  "%pip install git+https://github.com/openai/whisper.git"
 ] },
 { "cell_type": "code", "execution_count": 4, "metadata": { "id": "3CqtR2Fi5-vP" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2023-10-13 09:14:42.948361: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE4.1 SSE4.2 AVX AVX2 FMA\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" ] } ], "source": [
  "%%capture\n",
  "# all imports of the notebook live in this cell; it also selects cuda when available\n",
  "import gc\n",
  "import os\n",
  "\n",
  "import numpy as np\n",
  "\n",
  "# tensorflow is optional: it is imported when installed and silently skipped otherwise\n",
  "try:\n",
  "    import tensorflow\n",
  "except ImportError:\n",
  "    pass\n",
  "\n",
  "import torch\n",
  "import pandas as pd\n",
  "import whisper\n",
  "import torchaudio\n",
  "\n",
  "from ipywebrtc import AudioRecorder, CameraStream\n",
  "from IPython.display import Audio, display\n",
  "import ipywidgets as widgets\n",
  "from ipywidgets import Widget\n",
  "\n",
  "# DEVICE is used below both to place the model and to decide whether fp16 decoding is enabled\n",
  "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\""
 ] },
 { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 107, "referenced_widgets": [ "09446a03c33742dfa70a9f242f96b3be", "8964df95ded44ee28b7ed225c564ed9b", "823fe8b97ef94aedaed6889ac580c8eb", "5604b41fde3b45dd80b954c6128bccf7", "09e1f5b2de9945aea20f6129b7c82ec9", "35775e7c3c5846a589410566cbec95fa" ] }, "id": "-fFdSBBAGjFk", "outputId": "5894a254-7fe0-4593-fbee-74491cd72b9f" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "AudioRecorder(audio=Audio(value=b'', format='webm'), stream=CameraStream(constraints={'audio': True, 'video': …" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
  "# manually record audio (the stream is audio-only, no video) - if you want to use a custom audio file, skip this section\n",
  "camera = CameraStream(constraints={'audio': True, 'video': False})\n",
  "recorder = AudioRecorder(stream=camera)\n",
  "recorder"
 ] },
 { "cell_type": "code", "execution_count": 6, "metadata": { "id": "EDDgAohMGrCR" }, "outputs": [], "source": [
  "# save recording as file and convert to wav\n",
  "# (skipped when nothing was recorded, so ffmpeg is never run on an empty file with its errors silenced)\n",
  "if recorder.audio.value:\n",
  "    with open('recording.webm', 'wb') as f:\n",
  "        f.write(recorder.audio.value)\n",
  "    !ffmpeg -i recording.webm -ac 1 -f wav my_recording.wav -y -hide_banner -loglevel panic\n",
  "else:\n",
  "    print('No recording captured - skipping the conversion to my_recording.wav')"
 ] },
 { "cell_type": "code", "execution_count": 7, "metadata": { "id": "U7p2AoJItnIM" }, "outputs": [], "source": [
  "# Whisper is capable of performing transcriptions for many languages (though it performs better for some languages and worse for others.) Whisper is also capable of detecting the input language. \n",
  "# However, to be on the safe side, we can explicitly tell Whisper which language to expect. \n",
  "language_options = whisper.tokenizer.TO_LANGUAGE_CODE\n",
  "# the language names (the mapping's keys) become the dropdown entries\n",
  "language_list = list(language_options)"
 ] },
 { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 49, "referenced_widgets": [ "8c4d89ec973647d1a46aa471311e037c", "6c5f2af50210411f801f812dc17c389c", "90728ace9c454a3ab05ffb3e0bb664a3", "b50eded046cf4d378b1d71a186995d21" ] }, "id": "dpLnKvlb-vLa", "outputId": "bd012b8a-d413-41a9-834a-6674c0e2928a" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Dropdown(options=('english', 'chinese', 'german', 'spanish', 'russian', 'korean', 'french', 'japanese', 'portu…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
  "# choose the language spoken in the audio (read by the model-loading and options cells below)\n",
  "lang_dropdown = widgets.Dropdown(options=language_list, value='english')\n",
  "display(lang_dropdown)"
 ] },
 { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 49, "referenced_widgets": [ "43cdc270c46644b6a0308d3c499601fb", "5f9cd9708efb486c930c1397be9a566c", "20cdb2181c804215b4a1e6006be77734", "fb4b05bd474c409288394cd82d6a9179" ] }, "id": "ilyDW-ALMnke", "outputId": "4a34e1f6-519c-46e5-a00b-05fbe3540e18" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Dropdown(options=('transcribe', 'translate'), value='transcribe')" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
  "# Whisper is also capable of several tasks, including English-only transcription, \n",
  "# Any-to-English translation, and non-English transcription. \n",
  "task_dropdown = widgets.Dropdown(options=['transcribe', 'translate'], value='transcribe')\n",
  "display(task_dropdown)"
 ] },
 { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_PokfNJtOYNu", "outputId": "227e41ec-b1a5-409d-c3c4-20d4564fb09c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model is English-only and has 71,825,408 parameters.\n" ] } ], "source": [
  "# load the model (takes some seconds)\n",
  "# hint: Whisper comes in five model sizes, \n",
  "# four of which also have an optimized English-only version. \n",
  "# This notebook loads \"base\"-sized models (bigger than \"tiny\" but smaller than the others), which require about 1GB of RAM.\n",
  "\n",
  "# If you selected English above, this cell loads the optimized English-only version. Otherwise, it loads the multilingual model.\n",
  "model_name = \"base.en\" if lang_dropdown.value == \"english\" else \"base\"\n",
  "# place the model explicitly on the device chosen in the import cell\n",
  "model = whisper.load_model(model_name, device=DEVICE)\n",
  "print(\n",
  "    f\"Model is {'multilingual' if model.is_multilingual else 'English-only'} \"\n",
  "    f\"and has {sum(p.numel() for p in model.parameters()):,} parameters.\"\n",
  ")"
 ] },
 { "cell_type": "code", "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "DwOUHau-dkUt", "outputId": "d16eef87-3af5-43e6-b54e-407a2e32f5d8" }, "outputs": [ { "data": { "text/plain": [ "DecodingOptions(task='transcribe', language='english', temperature=0.0, sample_len=None, best_of=None, beam_size=None, patience=None, length_penalty=None, prompt=None, prefix=None, suppress_tokens='-1', suppress_blank=True, without_timestamps=True, max_initial_timestamp=1.0, fp16=True)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# set the options\n",
  "# fp16 defaults to True; half precision is only enabled on CUDA here so the notebook still decodes on CPU\n",
  "options = whisper.DecodingOptions(\n",
  "    language=lang_dropdown.value,\n",
  "    task=task_dropdown.value,\n",
  "    without_timestamps=True,\n",
  "    fp16=(DEVICE == \"cuda\"),\n",
  ")\n",
  "options"
 ] },
 { "cell_type": "code", "execution_count": 13, "metadata": { "id": "q6S0VvoK0vfq" }, "outputs": [], "source": [
  "# choose your audio file\n",
  "AUDIO_FILE = \"QA-01.mp3\"  # set to \"my_recording.wav\" to transcribe the recording made above\n",
  "\n",
  "audio = whisper.load_audio(AUDIO_FILE)\n",
  "# pad or trim to the fixed 30-second window that model.decode works on; longer audio is cut off\n",
  "audio = whisper.pad_or_trim(audio)\n",
  "mel = whisper.log_mel_spectrogram(audio).to(model.device)\n",
  "result = model.decode(mel, options)"
 ] },
 { "cell_type": "code", "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 53 }, "id": "Mg64_MWW1uMb", "outputId": "444cfab3-f2bd-4519-9779-3f4aba72d1cc" }, "outputs": [ { "data": { "text/plain": [ "'How many people are there in your family? There are five people in my family. My father, mother, brother, sister, and me. Does your family live in a house or an apartment? We live in a house in the countryside. What does your father do? My father is a doctor. He works at the local hospital. How old is your mother? She is 40 years old, one year younger than my father.'" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# print the text\n",
  "result.text"
 ] },
 { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [
  "# or write it into a text file\n",
  "# (explicit UTF-8 so non-English transcripts are written correctly on every platform;\n",
  "#  the with-block closes the file even if the write fails)\n",
  "with open(\"output.txt\", \"w\", encoding=\"utf-8\") as text_file:\n",
  "    text_file.write(result.text)"
 ] },
 { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [
  "# close all widgets\n",
  "Widget.close_all()"
 ] },
 { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [
  "# it takes around 2GB memory on GPU, so please clear it\n",
  "# drop every reference that holds GPU tensors, then return PyTorch's cached blocks to the driver;\n",
  "# the CUDA context itself is only released when the kernel shuts down\n",
  "model = mel = result = None\n",
  "gc.collect()\n",
  "if torch.cuda.is_available():\n",
  "    torch.cuda.empty_cache()"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }
 ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "include_colab_link": true, "provenance": [] }, "gpuClass": "standard", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name":
"python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "09446a03c33742dfa70a9f242f96b3be": { "model_module": "jupyter-webrtc", "model_module_version": "~0.6.0", "model_name": "AudioRecorderModel", "state": { "_data_src": "blob:https://nfrp4p17vqk-496ff2e9c6d22116-0-colab.googleusercontent.com/d984436f-3337-406a-97dd-76a1145ef36f", "_dom_classes": [], "_model_module": "jupyter-webrtc", "_model_module_version": "~0.6.0", "_model_name": "AudioRecorderModel", "_view_count": null, "_view_module": "jupyter-webrtc", "_view_module_version": "~0.6.0", "_view_name": "AudioRecorderView", "audio": "IPY_MODEL_8964df95ded44ee28b7ed225c564ed9b", "autosave": false, "codecs": "", "filename": "record", "format": "webm", "layout": "IPY_MODEL_823fe8b97ef94aedaed6889ac580c8eb", "recording": false, "stream": "IPY_MODEL_5604b41fde3b45dd80b954c6128bccf7" } }, "09e1f5b2de9945aea20f6129b7c82ec9": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, 
"min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "20cdb2181c804215b4a1e6006be77734": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "35775e7c3c5846a589410566cbec95fa": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "43cdc270c46644b6a0308d3c499601fb": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DropdownModel", "state": { "_dom_classes": [], 
"_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DropdownModel", "_options_labels": [ "transcribe", "translate" ], "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "DropdownView", "description": "", "description_tooltip": null, "disabled": false, "index": 0, "layout": "IPY_MODEL_5f9cd9708efb486c930c1397be9a566c", "style": "IPY_MODEL_20cdb2181c804215b4a1e6006be77734" } }, "5604b41fde3b45dd80b954c6128bccf7": { "model_module": "jupyter-webrtc", "model_module_version": "~0.6.0", "model_name": "CameraStreamModel", "state": { "_dom_classes": [], "_model_module": "jupyter-webrtc", "_model_module_version": "~0.6.0", "_model_name": "CameraStreamModel", "_view_count": null, "_view_module": "jupyter-webrtc", "_view_module_version": "~0.6.0", "_view_name": "MediaStreamView", "constraints": { "audio": true, "video": false }, "layout": "IPY_MODEL_35775e7c3c5846a589410566cbec95fa" } }, "5f9cd9708efb486c930c1397be9a566c": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, 
"overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "6c5f2af50210411f801f812dc17c389c": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "823fe8b97ef94aedaed6889ac580c8eb": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, 
"grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "8964df95ded44ee28b7ed225c564ed9b": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "AudioModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "AudioModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "AudioView", "autoplay": true, "controls": true, "format": "webm", "layout": "IPY_MODEL_09e1f5b2de9945aea20f6129b7c82ec9", "loop": true } }, "8c4d89ec973647d1a46aa471311e037c": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DropdownModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DropdownModel", "_options_labels": [ "english", "chinese", "german", "spanish", "russian", "korean", "french", "japanese", "portuguese", "turkish", "polish", "catalan", "dutch", "arabic", "swedish", "italian", "indonesian", "hindi", "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay", "czech", "romanian", "danish", "hungarian", "tamil", "norwegian", "thai", "urdu", "croatian", "bulgarian", "lithuanian", "latin", "maori", "malayalam", "welsh", "slovak", "telugu", "persian", "latvian", "bengali", "serbian", "azerbaijani", "slovenian", "kannada", "estonian", "macedonian", "breton", "basque", "icelandic", "armenian", "nepali", "mongolian", "bosnian", "kazakh", "albanian", "swahili", "galician", "marathi", "punjabi", "sinhala", "khmer", 
"shona", "yoruba", "somali", "afrikaans", "occitan", "georgian", "belarusian", "tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek", "faroese", "haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit", "luxembourgish", "myanmar", "tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian", "lingala", "hausa", "bashkir", "javanese", "sundanese", "burmese", "valencian", "flemish", "haitian", "letzeburgesch", "pushto", "panjabi", "moldavian", "moldovan", "sinhalese", "castilian" ], "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "DropdownView", "description": "", "description_tooltip": null, "disabled": false, "index": 0, "layout": "IPY_MODEL_6c5f2af50210411f801f812dc17c389c", "style": "IPY_MODEL_90728ace9c454a3ab05ffb3e0bb664a3" } }, "90728ace9c454a3ab05ffb3e0bb664a3": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "b50eded046cf4d378b1d71a186995d21": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, 
"grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "fb4b05bd474c409288394cd82d6a9179": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } } } } }, "nbformat": 4, "nbformat_minor": 4 }