From 09d28e7c4486ef30ed524483e7b872a3913d81e8 Mon Sep 17 00:00:00 2001 From: Julian Weber Date: Thu, 16 Nov 2023 12:30:26 +0100 Subject: [PATCH] Update to sentence spliting (#6) * Update to sentence spliting * Fix CI * Fix cloning --- .github/workflows/build-and-push-to-ghcr.yml | 47 +------------------- README.md | 6 +-- server/Dockerfile | 3 +- server/Dockerfile.cuda118 | 22 --------- server/main.py | 21 +++++---- server/requirements.txt | 2 +- test/test_streaming.py | 27 +++++------ 7 files changed, 34 insertions(+), 94 deletions(-) delete mode 100644 server/Dockerfile.cuda118 diff --git a/.github/workflows/build-and-push-to-ghcr.yml b/.github/workflows/build-and-push-to-ghcr.yml index 4a7bdca..c023672 100644 --- a/.github/workflows/build-and-push-to-ghcr.yml +++ b/.github/workflows/build-and-push-to-ghcr.yml @@ -4,7 +4,7 @@ on: branches: [main] pull_request: jobs: - build-and-push-to-ghcr-cuda117: + build-and-push-to-ghcr-cuda118: runs-on: ubuntu-22.04 steps: - @@ -49,51 +49,6 @@ jobs: tags: ghcr.io/coqui-ai/xtts-streaming-server:latest, ghcr.io/coqui-ai/xtts-streaming-server:main-${{ github.sha }} #build-args: - build-and-push-to-ghcr-cuda118: - runs-on: ubuntu-22.04 - steps: - - - name: Checkout - uses: actions/checkout@v3 - - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: 'Login to GitHub Container Registry' - run: | - set -xe - docker login --username ${{ github.actor }} --password ${{ secrets.GITHUB_TOKEN }} ghcr.io - - - name: 'Remove cache' - run: | - sudo rm -rf /usr/share/dotnet - sudo rm -rf /opt/ghc - sudo rm -rf "/usr/local/share/boost" - sudo rm -rf "$AGENT_TOOLSDIRECTORY" - - - name: Build only for PR cuda 11.8 - if: github.ref != 'refs/heads/main' - uses: docker/build-push-action@v5 - with: - context: "{{defaultContext}}:server" - file: Dockerfile.cuda118 - push: false # Do not push image for PR - cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda118; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda118-${{ github.event.number }} - cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda118-${{ github.event.number }} - - - name: Build and Push image cuda 11.8 - if: github.ref == 'refs/heads/main' - uses: docker/build-push-action@v5 - with: - context: "{{defaultContext}}:server" - file: Dockerfile.cuda118 - push: true # Push if merged - cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda118 - cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda118 - tags: ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda118, ghcr.io/coqui-ai/xtts-streaming-server:main-cuda118-${{ github.sha }} - #build-args: - build-and-push-to-ghcr-cuda121: runs-on: ubuntu-22.04 steps: diff --git a/README.md b/README.md index 886931d..8ffb730 100644 --- a/README.md +++ b/README.md @@ -32,16 +32,16 @@ $ python test_streaming.py ## Building the container -1. To build the Docker container (Pytorch 2.01 Cuda 11.7) : +1. To build the Docker container Pytorch 2.1 and CUDA 11.8 : ```bash $ cd server $ docker build -t xtts-stream . ``` -For Pytorch 2.1 and CUDA 11.8 version (when running set NVIDIA_DISABLE_REQUIRE=1 if you have Cuda < 11.8 drivers) +For Pytorch 2.1 and CUDA 12.1 : ```bash $ cd server -# docker build -t xtts-stream . -f Dockerfile.cuda118 +docker build -t xtts-stream . -f Dockerfile.cuda121 ``` 2. Run the server container: diff --git a/server/Dockerfile b/server/Dockerfile index 585f991..212d98b 100644 --- a/server/Dockerfile +++ b/server/Dockerfile @@ -1,4 +1,4 @@ -FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel +FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-devel ARG DEBIAN_FRONTEND=noninteractive RUN apt-get update && \ @@ -13,6 +13,7 @@ RUN python -m pip install --use-deprecated=legacy-resolver -r requirements.txt \ RUN python -m unidic download COPY main.py . +ENV NVIDIA_DISABLE_REQUIRE=1 ENV NUM_THREADS=2 EXPOSE 80 diff --git a/server/Dockerfile.cuda118 b/server/Dockerfile.cuda118 deleted file mode 100644 index cdb7898..0000000 --- a/server/Dockerfile.cuda118 +++ /dev/null @@ -1,22 +0,0 @@ -FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-devel -ARG DEBIAN_FRONTEND=noninteractive - -RUN apt-get update && \ - apt-get install --no-install-recommends -y sox libsox-fmt-all curl wget gcc git git-lfs build-essential libaio-dev libsndfile1 ssh ffmpeg && \ - apt-get clean && apt-get -y autoremove - -WORKDIR /app -COPY requirements.txt . -RUN python -m pip install --use-deprecated=legacy-resolver -r requirements.txt \ - && python -m pip cache purge - -RUN python -m unidic download - -COPY main.py . - -#Mark this 1 if you have older card -ENV NVIDIA_DISABLE_REQUIRE=1 - -ENV NUM_THREADS=2 -EXPOSE 80 -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80"] diff --git a/server/main.py b/server/main.py index 1bbb9ee..dfe56b8 100644 --- a/server/main.py +++ b/server/main.py @@ -54,7 +54,7 @@ def predict_speaker(wav_file: UploadFile): temp_audio_name = next(tempfile._get_candidate_names()) with open(temp_audio_name, "wb") as temp, torch.inference_mode(): temp.write(io.BytesIO(wav_file.file.read()).getbuffer()) - gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents( + gpt_cond_latent, speaker_embedding = model.get_conditioning_latents( temp_audio_name ) return { @@ -110,12 +110,13 @@ class StreamingInputs(BaseModel): "nl", "cs", "ar", - "zh-cn", + "zh", "ja", + "hu", + "ko", ] add_wav_header: bool = True stream_chunk_size: str = "20" - decoder: str = "ne_hifigan" def predict_streaming_generator(parsed_input: dict = Body(...)): @@ -127,16 +128,20 @@ def predict_streaming_generator(parsed_input: dict = Body(...)): ) text = parsed_input.text language = parsed_input.language - decoder = parsed_input.decoder - - if decoder not in ["ne_hifigan","hifigan"]: - decoder = "ne_hifigan" stream_chunk_size = int(parsed_input.stream_chunk_size) add_wav_header = parsed_input.add_wav_header - chunks = model.inference_stream(text, language, gpt_cond_latent, speaker_embedding, decoder=decoder,stream_chunk_size=stream_chunk_size) + chunks = model.inference_stream( + text, + language, + gpt_cond_latent, + speaker_embedding, + stream_chunk_size=stream_chunk_size, + enable_text_splitting=True + ) + for i, chunk in enumerate(chunks): chunk = postprocess(chunk) if i == 0 and add_wav_header: diff --git a/server/requirements.txt b/server/requirements.txt index 4e4d41a..bf45273 100644 --- a/server/requirements.txt +++ b/server/requirements.txt @@ -1,4 +1,4 @@ -TTS==0.20.2 +TTS @ git+https://github.com/coqui-ai/TTS@sentence_spliting uvicorn[standard]==0.23.2 fastapi==0.95.2 deepspeed==0.10.3 diff --git a/test/test_streaming.py b/test/test_streaming.py index 3e35409..8fe8dc5 100644 --- a/test/test_streaming.py +++ b/test/test_streaming.py @@ -38,11 +38,10 @@ def stream_ffplay(audio_stream, output_file, save=True): ffplay_proc.wait() -def tts(text, speaker,language, server_url , decoder, stream_chunk_size) -> Iterator[bytes]: +def tts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]: start = time.perf_counter() speaker["text"] = text speaker["language"] = language - speaker["decoder"] = decoder # "hifigan" or "ne_hifigan" for TTS>0.19.0 speaker["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality res = requests.post( f"{server_url}/tts_stream", @@ -86,7 +85,6 @@ def get_speaker(ref_audio,server_url): default="en", help="Language to use default is 'en' (English)" ) - parser.add_argument( "--output_file", default=None, @@ -102,18 +100,11 @@ def get_speaker(ref_audio,server_url): default="http://localhost:8000", help="Server url http://localhost:8000 default, change to your server location " ) - parser.add_argument( - "--decoder", - default="ne_hifigan", - help="Decoder for vocoder, ne_hifigan default, options ne_hifigan or hifigan" - ) - parser.add_argument( "--stream_chunk_size", default="20", help="Stream chunk size , 20 default, reducing will get faster latency but may degrade quality" ) - args = parser.parse_args() with open("./default_speaker.json", "r") as file: @@ -121,6 +112,16 @@ def get_speaker(ref_audio,server_url): if args.ref_file is not None: print("Computing the latents for a new reference...") - speaker = get_speaker(args.ref_file,args.server_url) - - audio = stream_ffplay(tts(args.text, speaker,args.language,args.server_url,args.decoder,args.stream_chunk_size), args.output_file, save=bool(args.output_file)) + speaker = get_speaker(args.ref_file, args.server_url) + + audio = stream_ffplay( + tts( + args.text, + speaker, + args.language, + args.server_url, + args.stream_chunk_size + ), + args.output_file, + save=bool(args.output_file) + )