Update model and policy
char-1ee committed May 3, 2024
1 parent 0ad0d12 commit d911664
Showing 10 changed files with 500 additions and 1,111 deletions.
3 changes: 2 additions & 1 deletion colossalai/inference/config.py
@@ -28,7 +28,8 @@
"llama": "[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n{input_text}[/INST]",
"baichuan": "<reserved_106>{input_text}<reserved_107>",
"vicuna": "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user input. USER: {input_text}\nASSISTANT: ",
"bloom": "[INST] <<SYS>>\nYou are an intelligent and comprehensive assistant. Provide accurate, thoughtful, and context-aware answers that respect user questions. Avoid content that is harmful, misleading, or unethical. Prioritize safety and fairness in all responses. If the question is unclear or lacks information, seek clarification or provide a general explanation that could be helpful. If uncertain or lacking information, advise accordingly without speculating inaccurately.\n<</SYS>>\n{input_text}[/INST]",
"bloom": "Assume you are a helpful robot. Please help react to my question or auto complete my prompt."
# "bloom": "[INST] <<SYS>>\nYou are an intelligent and comprehensive assistant. Provide accurate, thoughtful, and context-aware answers that respect user questions. Avoid content that is harmful, misleading, or unethical. Prioritize safety and fairness in all responses. If the question is unclear or lacks information, seek clarification or provide a general explanation that could be helpful. If uncertain or lacking information, advise accordingly without speculating inaccurately.\n<</SYS>>\n{input_text}[/INST]",
}


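The templates above are ordinary Python format strings keyed by model family, with `{input_text}` as the placeholder (the surrounding dict's variable name is not visible in this hunk). A minimal, self-contained sketch of how such a template might be filled, using a local copy of two entries purely for illustration:

```python
# Sketch only: a local copy of two template entries from the hunk above.
templates = {
    "baichuan": "<reserved_106>{input_text}<reserved_107>",
    "bloom": "Assume you are a helpful robot. Please help react to my question or auto complete my prompt.",
}

def build_prompt(model_type: str, input_text: str) -> str:
    """Fill the model-specific template, falling back to the raw text if none exists (illustrative helper)."""
    template = templates.get(model_type)
    if template is None:
        return input_text
    # Note: the new "bloom" entry carries no {input_text} placeholder, so format() returns it unchanged.
    return template.format(input_text=input_text)

print(build_prompt("baichuan", "What is paged KV caching?"))
print(build_prompt("bloom", "What is paged KV caching?"))
```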
23 changes: 5 additions & 18 deletions colossalai/inference/kv_cache/kvcache_manager.py
@@ -74,13 +74,6 @@ def __init__(
self.kv_head_num = get_model_config_attr(model_config, "num_key_value_heads", alter_attr=self.head_num)
self.head_size = get_model_config_attr(model_config, "hidden_size") // self.head_num

-# if hasattr(config, "num_key_value_heads"):
-# self.kv_head_num = getattr(config, "num_key_value_heads")
-# elif hasattr(config, "attribute_map") and hasattr(config, config.attribute_map["num_key_value_heads"]):
-# self.kv_head_num = getattr(config, config.attribute_map["num_key_value_heads"])
-# else:
-# self.kv_head_num = self.head_num

assert (
self.kv_head_num % self.tp_size == 0
), f"Cannot shard {self.kv_head_num} heads with tp size {self.tp_size}"
@@ -215,8 +208,7 @@ def allocate_context_from_block_table(self, block_table: torch.Tensor, context_l
block.add_ref()
if block_id == block_indexes[-1].item():
self._allocate_on_block(
-block,
-(block.block_size if context_len % block.block_size == 0 else context_len % block.block_size),
+block, block.block_size if context_len % block.block_size == 0 else context_len % block.block_size
)
else:
self._allocate_on_block(block, block.block_size)
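The conditional argument in this hunk decides how much space to reserve on a sequence's last cache block: a full block when the context length is an exact multiple of the block size, otherwise only the remainder. A small standalone sketch of that arithmetic (the helper name is illustrative, not from the source):

```python
def last_block_alloc_size(context_len: int, block_size: int) -> int:
    """Slots to allocate on the final cache block, mirroring the expression above."""
    remainder = context_len % block_size
    return block_size if remainder == 0 else remainder

# With block_size=16: a 35-token context fills two full blocks plus 3 slots on its last block,
# while a 32-token context ends exactly on a block boundary and takes a full 16-slot last block.
assert last_block_alloc_size(35, 16) == 3
assert last_block_alloc_size(32, 16) == 16
```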
@@ -283,11 +275,9 @@ def allocate_context_from_block_tables(self, block_tables: torch.Tensor, context
block.add_ref()
self._allocate_on_block(
block,
-(
-block.block_size
-if context_lengths[i] % block.block_size == 0
-else context_lengths[i].item() % block.block_size
-),
+block.block_size
+if context_lengths[i] % block.block_size == 0
+else context_lengths[i].item() % block.block_size,
)
for block_id in alloc_block_ids:
if block_id in alloc_block_ids[last_block_locs]:
@@ -460,10 +450,7 @@ def clear_all(self) -> None:

def get_physical_cache(self, layer_id: int, block_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
"""Get the tensor corresponding to the cache block with the prompted id for a specific layer."""
-return (
-self._kv_caches[0][layer_id][block_idx],
-self._kv_caches[1][layer_id][block_idx],
-)
+return self._kv_caches[0][layer_id][block_idx], self._kv_caches[1][layer_id][block_idx]

def _allocate_on_block(self, block: CacheBlock, space_asked: int) -> int:
"""Allocate a specific size of space on a provided cache block.
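The simplified `get_physical_cache` return implies that `self._kv_caches` is a pair of per-layer collections, `(key_caches, value_caches)`, each indexed first by layer and then by block. A hedged sketch of that indexing with made-up tensor shapes, purely to show the access pattern:

```python
import torch

# Made-up shapes for illustration only; the real cache layout is defined elsewhere in kvcache_manager.py.
num_layers, num_blocks, kv_head_num, block_size, head_size = 2, 4, 8, 16, 64

key_caches = [torch.zeros(num_blocks, kv_head_num, block_size, head_size) for _ in range(num_layers)]
value_caches = [torch.zeros(num_blocks, kv_head_num, block_size, head_size) for _ in range(num_layers)]
kv_caches = (key_caches, value_caches)

def get_physical_cache(layer_id: int, block_idx: int):
    """Mirror of the one-line return above: the (key, value) cache block for one layer."""
    return kv_caches[0][layer_id][block_idx], kv_caches[1][layer_id][block_idx]

k_block, v_block = get_physical_cache(layer_id=0, block_idx=2)
print(k_block.shape, v_block.shape)  # torch.Size([8, 16, 64]) for each, under these made-up shapes
```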