Add threading.Lock() to 'support' concurrent requests #22

Open · wants to merge 5 commits into base: main
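This PR serializes access to the XTTS model by wrapping each endpoint body in a single process-wide `threading.Lock`, so overlapping requests are handled one at a time instead of contending for the model. A minimal sketch of that pattern follows (the endpoint, schema, and `run_model` helper are illustrative placeholders, not the repository's exact code):

```python
import threading

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()
lock = threading.Lock()  # one process-wide lock shared by all request handlers


class SynthesisRequest(BaseModel):
    # Illustrative input schema, not the PR's exact StreamingInputs/TTSInputs models.
    text: str
    language: str


def run_model(text: str, language: str) -> dict:
    # Stand-in for the actual XTTS inference call.
    return {"text": text, "language": language}


@app.post("/tts")
def synthesize(req: SynthesisRequest):
    # FastAPI runs sync ("def") endpoints in a thread pool, so two requests can
    # reach the model at the same time; holding the lock makes them run serially.
    with lock:
        return run_model(req.text, req.language)
```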
54 changes: 36 additions & 18 deletions .github/workflows/build-and-push-to-ghcr.yml
@@ -7,6 +7,12 @@ jobs:
build-and-push-to-ghcr-cuda118:
runs-on: ubuntu-22.04
steps:
-
name: Set owner name to lower case
run: |
echo "OWNER_LC=${OWNER,,}" >>${GITHUB_ENV}
env:
OWNER: '${{ github.repository_owner }}'
-
name: Checkout
uses: actions/checkout@v3
@@ -21,7 +27,7 @@ jobs:
docker login --username ${{ github.actor }} --password ${{ secrets.GITHUB_TOKEN }} ghcr.io
- name: 'Remove cache'
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
@@ -34,8 +40,8 @@ jobs:
context: "{{defaultContext}}:server"
file: Dockerfile
push: false # Do not push image for PR
cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-${{ github.event.number }}
cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-${{ github.event.number }}
cache-from: type=registry,ref=ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:cache-latest; type=registry,ref=ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:cache-pr-${{ github.event.number }}
cache-to: type=registry,ref=ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:cache-pr-${{ github.event.number }}

- name: Build and Push image Cuda 11.8
if: github.ref == 'refs/heads/main'
@@ -44,14 +50,20 @@ jobs:
context: "{{defaultContext}}:server"
file: Dockerfile
push: true # Push if merged
cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest
cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest
tags: ghcr.io/coqui-ai/xtts-streaming-server:latest, ghcr.io/coqui-ai/xtts-streaming-server:main-${{ github.sha }}
cache-from: type=registry,ref=ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:cache-latest
cache-to: type=registry,ref=ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:cache-latest
tags: ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:latest, ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:main-${{ github.sha }}
#build-args:

build-and-push-to-ghcr-cuda121:
runs-on: ubuntu-22.04
steps:
-
name: Set owner name to lower case
run: |
echo "OWNER_LC=${OWNER,,}" >>${GITHUB_ENV}
env:
OWNER: '${{ github.repository_owner }}'
-
name: Checkout
uses: actions/checkout@v3
@@ -66,7 +78,7 @@ jobs:
docker login --username ${{ github.actor }} --password ${{ secrets.GITHUB_TOKEN }} ghcr.io
- name: 'Remove cache'
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
@@ -79,8 +91,8 @@ jobs:
context: "{{defaultContext}}:server"
file: Dockerfile.cuda121
push: false # Do not push image for PR
cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda121; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda121-${{ github.event.number }}
cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda121-${{ github.event.number }}
cache-from: type=registry,ref=ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:cache-latest-cuda121; type=registry,ref=ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:cache-pr-cuda121-${{ github.event.number }}
cache-to: type=registry,ref=ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:cache-pr-cuda121-${{ github.event.number }}

- name: Build and Push image cuda 12.1
if: github.ref == 'refs/heads/main'
@@ -89,13 +101,19 @@ jobs:
context: "{{defaultContext}}:server"
file: Dockerfile.cuda121
push: true # Push if merged
cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda121
cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda121
tags: ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda121, ghcr.io/coqui-ai/xtts-streaming-server:main-cuda121-${{ github.sha }}
cache-from: type=registry,ref=ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:cache-latest-cuda121
cache-to: type=registry,ref=ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:cache-latest-cuda121
tags: ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:latest-cuda121, ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:main-cuda121-${{ github.sha }}
#build-args:
build-and-push-to-ghcr-cpu:
runs-on: ubuntu-22.04
steps:
-
name: Set owner name to lower case
run: |
echo "OWNER_LC=${OWNER,,}" >>${GITHUB_ENV}
env:
OWNER: '${{ github.repository_owner }}'
-
name: Checkout
uses: actions/checkout@v3
@@ -110,7 +128,7 @@ jobs:
docker login --username ${{ github.actor }} --password ${{ secrets.GITHUB_TOKEN }} ghcr.io
- name: 'Remove cache'
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
@@ -123,8 +141,8 @@ jobs:
context: "{{defaultContext}}:server"
file: Dockerfile.cpu
push: false # Do not push image for PR
cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cpu; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda121-${{ github.event.number }}
cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cpu-${{ github.event.number }}
cache-from: type=registry,ref=ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:cache-latest-cpu; type=registry,ref=ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:cache-pr-cuda121-${{ github.event.number }}
cache-to: type=registry,ref=ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:cache-pr-cpu-${{ github.event.number }}

- name: Build and Push image CPU
if: github.ref == 'refs/heads/main'
@@ -133,7 +151,7 @@ jobs:
context: "{{defaultContext}}:server"
file: Dockerfile.cpu
push: true # Push if merged
cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cpu
cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cpu
tags: ghcr.io/coqui-ai/xtts-streaming-server:latest-cpu, ghcr.io/coqui-ai/xtts-streaming-server:main-cpu-${{ github.sha }}
cache-from: type=registry,ref=ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:cache-latest-cpu
cache-to: type=registry,ref=ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:cache-latest-cpu
tags: ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:latest-cpu, ghcr.io/${{ env.OWNER_LC }}/xtts-streaming-server:main-cpu-${{ github.sha }}
#build-args:
5 changes: 4 additions & 1 deletion README.md
@@ -1,5 +1,5 @@
# XTTS streaming server
*Warning: XTTS-streaming-server doesn't support concurrent streaming requests, it's a demo server, not meant for production.*
*Warning: XTTS-streaming-server is a demo server, not meant for production.*

https://github.com/coqui-ai/xtts-streaming-server/assets/17219561/7220442a-e88a-4288-8a73-608c4b39d06c

@@ -81,3 +81,6 @@ $ cd xtts-streaming-server/test
$ python -m pip install -r requirements.txt
$ python test_streaming.py
```

### Forked Repos
If you fork this repo, the GitHub Action will automatically build and push a Docker image to your container registry, so the image will be available at ghcr.io/yourusername/xtts-streaming-server.
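GHCR image names must be all lowercase, which is why each job in the workflow above first lowercases the repository owner (`${OWNER,,}`) before composing the image reference. A small Python illustration of that naming rule, mirroring the workflow's tag layout (the helper itself is not part of the repo):

```python
def ghcr_image(owner: str, repo: str = "xtts-streaming-server", tag: str = "latest") -> str:
    # GHCR rejects uppercase characters in image names, so the owner is lowercased,
    # just as the "Set owner name to lower case" step does with ${OWNER,,}.
    return f"ghcr.io/{owner.lower()}/{repo}:{tag}"


print(ghcr_image("YourUserName"))  # -> ghcr.io/yourusername/xtts-streaming-server:latest
```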
90 changes: 49 additions & 41 deletions server/main.py
@@ -5,6 +5,7 @@
import wave
import torch
import numpy as np
import threading
from typing import List
from pydantic import BaseModel

@@ -19,7 +20,7 @@
torch.set_num_threads(int(os.environ.get("NUM_THREADS", os.cpu_count())))
device = torch.device("cuda" if os.environ.get("USE_CPU", "0") == "0" else "cpu")
if not torch.cuda.is_available() and device == "cuda":
raise RuntimeError("CUDA device unavailable, please use Dockerfile.cpu instead.")
raise RuntimeError("CUDA device unavailable, please use Dockerfile.cpu instead.")

custom_model_path = os.environ.get("CUSTOM_MODEL_PATH", "/app/tts_models")

@@ -44,6 +45,9 @@

print("Running XTTS Server ...", flush=True)

lock = threading.Lock() # Create a lock object
print("Establishing lock ...", flush=True)

##### Run fastapi #####
app = FastAPI(
title="XTTS Streaming server",
@@ -52,20 +56,20 @@
docs_url="/",
)


@app.post("/clone_speaker")
def predict_speaker(wav_file: UploadFile):
"""Compute conditioning inputs from reference audio file."""
temp_audio_name = next(tempfile._get_candidate_names())
with open(temp_audio_name, "wb") as temp, torch.inference_mode():
temp.write(io.BytesIO(wav_file.file.read()).getbuffer())
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
temp_audio_name
)
return {
"gpt_cond_latent": gpt_cond_latent.cpu().squeeze().half().tolist(),
"speaker_embedding": speaker_embedding.cpu().squeeze().half().tolist(),
}
with lock:
"""Compute conditioning inputs from reference audio file."""
temp_audio_name = next(tempfile._get_candidate_names())
with open(temp_audio_name, "wb") as temp, torch.inference_mode():
temp.write(io.BytesIO(wav_file.file.read()).getbuffer())
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
temp_audio_name
)
return {
"gpt_cond_latent": gpt_cond_latent.cpu().squeeze().half().tolist(),
"speaker_embedding": speaker_embedding.cpu().squeeze().half().tolist(),
}


def postprocess(wav):
@@ -137,10 +141,11 @@ def predict_streaming_generator(parsed_input: dict = Body(...)):

@app.post("/tts_stream")
def predict_streaming_endpoint(parsed_input: StreamingInputs):
return StreamingResponse(
predict_streaming_generator(parsed_input),
media_type="audio/wav",
)
with lock:
return StreamingResponse(
predict_streaming_generator(parsed_input),
media_type="audio/wav",
)

class TTSInputs(BaseModel):
speaker_embedding: List[float]
@@ -150,36 +155,39 @@ class TTSInputs(BaseModel):

@app.post("/tts")
def predict_speech(parsed_input: TTSInputs):
speaker_embedding = torch.tensor(parsed_input.speaker_embedding).unsqueeze(0).unsqueeze(-1)
gpt_cond_latent = torch.tensor(parsed_input.gpt_cond_latent).reshape((-1, 1024)).unsqueeze(0)
text = parsed_input.text
language = parsed_input.language

out = model.inference(
text,
language,
gpt_cond_latent,
speaker_embedding,
)
with lock:
speaker_embedding = torch.tensor(parsed_input.speaker_embedding).unsqueeze(0).unsqueeze(-1)
gpt_cond_latent = torch.tensor(parsed_input.gpt_cond_latent).reshape((-1, 1024)).unsqueeze(0)
text = parsed_input.text
language = parsed_input.language

out = model.inference(
text,
language,
gpt_cond_latent,
speaker_embedding,
)

wav = postprocess(torch.tensor(out["wav"]))
wav = postprocess(torch.tensor(out["wav"]))

return encode_audio_common(wav.tobytes())
return encode_audio_common(wav.tobytes())


@app.get("/studio_speakers")
def get_speakers():
if hasattr(model, "speaker_manager") and hasattr(model.speaker_manager, "speakers"):
return {
speaker: {
"speaker_embedding": model.speaker_manager.speakers[speaker]["speaker_embedding"].cpu().squeeze().half().tolist(),
"gpt_cond_latent": model.speaker_manager.speakers[speaker]["gpt_cond_latent"].cpu().squeeze().half().tolist(),
with lock:
if hasattr(model, "speaker_manager") and hasattr(model.speaker_manager, "speakers"):
return {
speaker: {
"speaker_embedding": model.speaker_manager.speakers[speaker]["speaker_embedding"].cpu().squeeze().half().tolist(),
"gpt_cond_latent": model.speaker_manager.speakers[speaker]["gpt_cond_latent"].cpu().squeeze().half().tolist(),
}
for speaker in model.speaker_manager.speakers.keys()
}
for speaker in model.speaker_manager.speakers.keys()
}
else:
return {}

else:
return {}

@app.get("/languages")
def get_languages():
return config.languages
with lock:
return config.languages
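With the lock in place, overlapping calls to any of the endpoints above are served one after another rather than concurrently. A rough client-side way to observe this, assuming the server is reachable at http://localhost:8000 (an assumption, not part of this diff) and the `requests` package is installed:

```python
import threading
import time

import requests  # assumed to be available, e.g. via the test requirements

BASE_URL = "http://localhost:8000"  # assumed address; adjust to wherever the server runs


def timed_get(path: str) -> None:
    start = time.perf_counter()
    resp = requests.get(f"{BASE_URL}{path}")
    print(f"{path}: HTTP {resp.status_code} after {time.perf_counter() - start:.2f}s")


# Fire two requests at once; because every endpoint acquires the same module-level
# lock, the second request waits for the first instead of overlapping with it.
threads = [threading.Thread(target=timed_get, args=("/languages",)) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```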