Feat/evict req on client disconnect streaming case #223

Draft: wants to merge 32 commits into main
Commits (32)
d613565
chore: Add SimpleDelayedStreamAPI for delayed streaming of output
bhimrazy Aug 26, 2024
371cf56
add test_stream_client_disconnection
bhimrazy Aug 26, 2024
9e7f841
add request_evicted_status param to run_streaming_loop
bhimrazy Aug 26, 2024
7ce49ac
update test_stream_client_disconnection
bhimrazy Aug 26, 2024
56c8587
adds functionality to evict the request if disconnected before comple…
bhimrazy Aug 26, 2024
f5961c4
Merge branch 'main' into feat/evict-req-on-client-disconnect-streamin…
bhimrazy Aug 26, 2024
9330997
update exception
aniketmaurya Aug 26, 2024
1f0bfe5
fix test
aniketmaurya Aug 26, 2024
d41db3c
Update src/litserve/server.py
aniketmaurya Aug 26, 2024
4344720
Merge branch 'main' into feat/evict-req-on-client-disconnect-streamin…
aniketmaurya Aug 26, 2024
f177fcb
Merge branch 'main' into feat/evict-req-on-client-disconnect-streamin…
bhimrazy Aug 27, 2024
4e5045a
Merge branch 'main' into feat/evict-req-on-client-disconnect-streamin…
bhimrazy Aug 28, 2024
1d4677c
Merge branch 'main' into feat/evict-req-on-client-disconnect-streamin…
bhimrazy Aug 29, 2024
ca6fbc2
reverted changes to new updates
bhimrazy Aug 31, 2024
e61cdab
update
bhimrazy Aug 31, 2024
6c2e0c6
update
bhimrazy Aug 31, 2024
6668cc8
Merge branch 'main' into feat/evict-req-on-client-disconnect-streamin…
bhimrazy Aug 31, 2024
3448ef3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 31, 2024
2cfd68e
chore: Add test for streaming client disconnection
bhimrazy Aug 31, 2024
c95ee45
handle client disconnection streaming nonbatched case
bhimrazy Aug 31, 2024
bac5534
chore: Optimize streaming loop performance by checking for client dis…
bhimrazy Aug 31, 2024
f08ed4b
chore: Update streaming loop to include request eviction status
bhimrazy Aug 31, 2024
e060e39
Merge branch 'main' into feat/evict-req-on-client-disconnect-streamin…
bhimrazy Sep 21, 2024
2b11fc7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 21, 2024
5323c51
Refactor inference_worker function to remove optional parameters and …
bhimrazy Sep 21, 2024
4368b57
update
bhimrazy Sep 21, 2024
611a751
update
bhimrazy Sep 21, 2024
5cc0f77
add missing param
bhimrazy Sep 21, 2024
8d4a05d
add missing param
bhimrazy Sep 21, 2024
bd68b6c
add missing param for run streaming loop
bhimrazy Sep 21, 2024
56f1076
test by removing the check interval
bhimrazy Sep 21, 2024
49bed55
so there is performance drop with this check,
bhimrazy Sep 21, 2024
12 changes: 10 additions & 2 deletions src/litserve/loops.py
@@ -236,6 +236,7 @@ def run_streaming_loop(
lit_spec: LitSpec,
request_queue: Queue,
response_queues: List[Queue],
request_evicted_status: Dict[str, bool],
callback_runner: CallbackRunner,
):
while True:
@@ -279,7 +280,11 @@ def run_streaming_loop(
lit_api.encode_response,
y_gen,
)
for y_enc in y_enc_gen:
check_interval = 50
for index, y_enc in enumerate(y_enc_gen):
if index % check_interval == 0 and request_evicted_status.get(uid):
request_evicted_status.pop(uid)
break
Comment on lines +283 to +287

bhimrazy (Contributor, Author) commented:
Checking the request_evicted_status for each token appears to have a significant impact, reducing performance from 3600 to around 3100. However, it may not be necessary to perform this check on every token.

Adding a check interval helps reduce the overhead and brings the performance closer to that of the main branch, but it still doesn't feel like an ideal solution.

aniketmaurya (Collaborator) commented on Sep 21, 2024:
Thanks for your patience with the PR and for checking the speed issue @bhimrazy 🙌.

Yeah, and in the case where the time-to-first-token is large but the rest of the token stream is fast, it doesn't help much.

Collaborator commented:

This is just a single worker; with multiple workers it might impact even more.

lantiga (Collaborator) commented:
I think the overall design is correct; we are just checking the distributed dict way too aggressively and getting into contention problems.

One alternative that could reduce contention is taking a snapshot of the disconnected dictionary in every worker loop: instead of a managed dict, use a shared value that the server publishes and that each worker reads as a whole periodically (every N seconds; we don't need a thread, we just check the time at every loop). This way every worker has a semi-up-to-date local dictionary that it can check as often as we want.

Having semi-up-to-date info on who disconnected every N seconds is totally fine; we don't need to react immediately.

This design also helps with ignoring items in the queue that come from clients that have already disconnected. For those we necessarily have to check at every request. If the local dictionary is not up to date we'll run some requests for nothing, but that's ok. One caveat is making sure the responses don't accumulate in the response dictionary on the webserver process in this case (let's remember this).
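
Below is a minimal, hypothetical sketch of the snapshot idea described above; the names (DisconnectSnapshot, is_disconnected) and the JSON-encoded shared value are assumptions for illustration and not part of LitServe or this PR.

# Hypothetical sketch: the server publishes the full set of disconnected UIDs as one
# shared value; each worker refreshes a local copy at most every `interval` seconds.
import json
import time
from multiprocessing import Manager


class DisconnectSnapshot:
    """Worker-side view of disconnected UIDs, refreshed at most every `interval` seconds."""

    def __init__(self, shared_value, interval: float = 2.0):
        self._shared = shared_value  # manager.Value holding a JSON-encoded list of UIDs
        self._interval = interval
        self._last_refresh = 0.0
        self._local = set()

    def is_disconnected(self, uid: str) -> bool:
        now = time.monotonic()
        if now - self._last_refresh >= self._interval:
            # Read the whole snapshot once; per-token checks then only touch local memory.
            self._local = set(json.loads(self._shared.value))
            self._last_refresh = now
        return uid in self._local


if __name__ == "__main__":
    manager = Manager()
    disconnected = manager.Value("u", json.dumps([]))  # published by the server process

    # Server side: overwrite the snapshot as a whole when a client disconnects.
    disconnected.value = json.dumps(["uid-1"])

    # Worker side: cheap check inside the streaming loop, as often as we like.
    snapshot = DisconnectSnapshot(disconnected, interval=2.0)
    print(snapshot.is_disconnected("uid-1"))  # True (first call refreshes the snapshot)
    print(snapshot.is_disconnected("uid-2"))  # False (not in the local copy)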

bhimrazy (Contributor, Author) replied:
Thank you, @lantiga, for the valuable insights. This approach seems promising. I'll take some time to study the concept and work on the implementation shortly.

y_enc = lit_api.format_encoded_response(y_enc)
response_queues[response_queue_id].put((uid, (y_enc, LitAPIStatus.OK)))
response_queues[response_queue_id].put((uid, ("", LitAPIStatus.FINISH_STREAMING)))
@@ -377,6 +382,7 @@ def inference_worker(
batch_timeout: float,
stream: bool,
workers_setup_status: Dict[str, bool],
request_evicted_status: Dict[str, bool],
callback_runner: CallbackRunner,
):
callback_runner.trigger_event(EventTypes.BEFORE_SETUP, lit_api=lit_api)
@@ -397,7 +403,9 @@
lit_api, lit_spec, request_queue, response_queues, max_batch_size, batch_timeout, callback_runner
)
else:
run_streaming_loop(lit_api, lit_spec, request_queue, response_queues, callback_runner)
run_streaming_loop(
lit_api, lit_spec, request_queue, response_queues, request_evicted_status, callback_runner
)
return

if max_batch_size > 1:
44 changes: 26 additions & 18 deletions src/litserve/server.py
@@ -89,7 +89,7 @@ async def response_queue_to_buffer(
await asyncio.sleep(0.0001)
continue
stream_response_buffer, event = response_buffer[uid]
stream_response_buffer.append(response)
stream_response_buffer.append((uid, response))
event.set()

else:
@@ -206,6 +206,7 @@ def launch_inference_worker(self, num_uvicorn_servers: int):
manager = mp.Manager()
self.workers_setup_status = manager.dict()
self.request_queue = manager.Queue()
self.request_evicted_status = manager.dict()

self.response_queues = [manager.Queue() for _ in range(num_uvicorn_servers)]

@@ -237,6 +238,7 @@ def launch_inference_worker(self, num_uvicorn_servers: int):
self.batch_timeout,
self.stream,
self.workers_setup_status,
self.request_evicted_status,
self._callback_runner,
),
)
@@ -272,26 +274,32 @@ def device_identifiers(self, accelerator, device):
return [f"{accelerator}:{device}"]

async def data_streamer(self, q: deque, data_available: asyncio.Event, send_status: bool = False):
uid = None
while True:
await data_available.wait()
while len(q) > 0:
data, status = q.popleft()
if status == LitAPIStatus.FINISH_STREAMING:
return

if status == LitAPIStatus.ERROR:
logger.error(
"Error occurred while streaming outputs from the inference worker. "
"Please check the above traceback."
)
try:
await data_available.wait()
while len(q) > 0:
uid, (data, status) = q.popleft()
if status == LitAPIStatus.FINISH_STREAMING:
return
if status == LitAPIStatus.ERROR:
logger.error(
"Error occurred while streaming outputs from the inference worker. "
"Please check the above traceback."
)
if send_status:
yield data, status
return
if send_status:
yield data, status
return
if send_status:
yield data, status
else:
yield data
data_available.clear()
else:
yield data
data_available.clear()
except asyncio.CancelledError:
if uid is not None:
self.request_evicted_status[uid] = True
logger.exception("Streaming request cancelled for the uid=%s", uid)
return

def register_endpoints(self):
"""Register endpoint routes for the FastAPI app and setup middlewares."""
13 changes: 13 additions & 0 deletions tests/conftest.py
@@ -55,6 +55,14 @@ def encode_response(self, output: Generator) -> Generator:
yield out.lower()


class SimpleDelayedStreamAPI(SimpleStreamAPI):
def encode_response(self, output: Generator) -> Generator:
delay = 0.2
for out in output:
time.sleep(delay)
yield out.lower()


class SimpleBatchedStreamAPI(LitAPI):
def setup(self, device) -> None:
self.sentence = "LitServe is streaming output"
@@ -98,6 +106,11 @@ def simple_batched_stream_api():
return SimpleBatchedStreamAPI()


@pytest.fixture
def simple_delayed_stream_api():
return SimpleDelayedStreamAPI()


@pytest.fixture
def lit_server(simple_litapi):
server = LitServer(simple_litapi, accelerator="cpu", devices=1, timeout=10)
22 changes: 22 additions & 0 deletions tests/test_lit_server.py
@@ -13,6 +13,7 @@
# limitations under the License.
import asyncio
import pickle
import logging
import re
import sys

@@ -81,6 +82,27 @@ async def test_stream(simple_stream_api):
), "Server returns input prompt and generated output which didn't match."


@pytest.mark.asyncio
async def test_stream_client_disconnection(simple_delayed_stream_api, caplog):
server = LitServer(simple_delayed_stream_api, stream=True, timeout=10)

with wrap_litserve_start(server) as server, caplog.at_level(logging.DEBUG):
async with LifespanManager(server.app) as manager, AsyncClient(app=manager.app, base_url="http://test") as ac:
task = asyncio.create_task(ac.post("/predict", json={"prompt": "Hey, How are you doing?"}, timeout=10))
await asyncio.sleep(2)
task.cancel() # simulate client disconnection
await asyncio.sleep(1) # wait for the task to stop
with pytest.raises(asyncio.CancelledError):
await task
assert "Streaming request cancelled for the uid=" in caplog.text
# TODO: also check if the task actually stopped in the server

caplog.clear()
task = asyncio.create_task(ac.post("/predict", json={"prompt": "Hey, How are you doing?"}, timeout=10))
await task
assert "Streaming request cancelled for the uid=" not in caplog.text


@pytest.mark.asyncio
async def test_batched_stream_server(simple_batched_stream_api):
server = LitServer(simple_batched_stream_api, stream=True, max_batch_size=4, batch_timeout=2, timeout=30)
18 changes: 15 additions & 3 deletions tests/test_loops.py
@@ -99,10 +99,16 @@ def fake_encode(output):
requests_queue = Queue()
requests_queue.put((0, "UUID-1234", time.monotonic(), {"prompt": "Hello"}))
response_queues = [FakeStreamResponseQueue(num_streamed_outputs)]
request_evicted_status = {}

with pytest.raises(StopIteration, match="exit loop"):
run_streaming_loop(
fake_stream_api, fake_stream_api, requests_queue, response_queues, callback_runner=NOOP_CB_RUNNER
fake_stream_api,
fake_stream_api,
requests_queue,
response_queues,
request_evicted_status,
callback_runner=NOOP_CB_RUNNER,
)

fake_stream_api.predict.assert_called_once_with("Hello")
@@ -182,6 +188,7 @@ def test_inference_worker(mock_single_loop, mock_batched_loop):
batch_timeout=0,
stream=False,
workers_setup_status={},
request_evicted_status={},
callback_runner=NOOP_CB_RUNNER,
)
mock_batched_loop.assert_called_once()
@@ -192,6 +199,7 @@ def test_inference_worker(mock_single_loop, mock_batched_loop):
batch_timeout=0,
stream=False,
workers_setup_status={},
request_evicted_status={},
callback_runner=NOOP_CB_RUNNER,
)
mock_single_loop.assert_called_once()
@@ -322,10 +330,12 @@ def test_run_streaming_loop():
request_queue = Queue()
request_queue.put((0, "UUID-001", time.monotonic(), {"input": "Hello"}))
response_queues = [Queue()]
request_evicted_status = {}

# Run the loop in a separate thread to allow it to be stopped
loop_thread = threading.Thread(
target=run_streaming_loop, args=(lit_api, None, request_queue, response_queues, NOOP_CB_RUNNER)
target=run_streaming_loop,
args=(lit_api, None, request_queue, response_queues, request_evicted_status, NOOP_CB_RUNNER),
)
loop_thread.start()

@@ -350,10 +360,12 @@ def test_run_streaming_loop_timeout(caplog):
request_queue = Queue()
request_queue.put((0, "UUID-001", time.monotonic() - 5, {"input": "Hello"}))
response_queues = [Queue()]
request_evicted_status = {}

# Run the loop in a separate thread to allow it to be stopped
loop_thread = threading.Thread(
target=run_streaming_loop, args=(lit_api, None, request_queue, response_queues, NOOP_CB_RUNNER)
target=run_streaming_loop,
args=(lit_api, None, request_queue, response_queues, request_evicted_status, NOOP_CB_RUNNER),
)
loop_thread.start()
