
Commit ddfb8f1

Merge pull request llmware-ai#712 from llmware-ai/gguf-streaming-interface
adding gguf real time streaming method and examples
2 parents 73f8e8d + ff4fa6f commit ddfb8f1

File tree

4 files changed: +249 -0 lines changed


examples/Models/gguf_streaming.py

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
""" This example illustrates how to use the stream method for GGUF models for fast streaming of inference output,
especially for real-time chat interactions.

Please note that the stream method has been implemented for GGUF models starting in llmware-0.2.13. This applies to
any model with the GGUFGenerativeModel class, and generally includes models with names that end in "gguf".

See also the chat UI example in the UI examples folder.

We would recommend using a chat-optimized model, and have included a representative list below.

"""

from llmware.models import ModelCatalog
from llmware.gguf_configs import GGUFConfigs

# sets an absolute output maximum for the GGUF engine - normally set by default at 256
GGUFConfigs().set_config("max_output_tokens", 1000)

chat_models = ["phi-3-gguf",
               "llama-2-7b-chat-gguf",
               "llama-3-instruct-bartowski-gguf",
               "openhermes-mistral-7b-gguf",
               "zephyr-7b-gguf",
               "tiny-llama-chat-gguf"]

model_name = chat_models[0]

# maximum output can be set optionally at any number up to the "max_output_tokens" set
model = ModelCatalog().load_model(model_name, max_output=200)

text_out = ""
token_count = 0

prompt = "I am interested in gaining an understanding of the banking industry. What topics should I research?"

# since model.stream provides a generator, consume it in a loop to receive the output token-by-token
for streamed_token in model.stream(prompt):

    text_out += streamed_token
    if text_out.strip():
        print(streamed_token, end="")

    token_count += 1

# final output text and token count
print("\n\n***total text out***: ", text_out)
print("\n***total tokens***: ", token_count)
examples/UI/gguf_streaming_chatbot.py

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
""" This example shows how to build a local chatbot prototype using llmware and Streamlit. The example shows
how to use several GGUF chat models in the LLMWare catalog, along with using the model.stream method which
provides a real-time generator for displaying the bot response as it is generated.

This is purposefully a super-simple script (but surprisingly fun) to provide the core of the recipe.

The Streamlit code below is derived from Streamlit tutorials available at:
https://docs.streamlit.io/develop/tutorials/llms/build-conversational-apps

If you are new to using Streamlit, to run this example:

1. pip3 install streamlit

2. to run, go to the command line: streamlit run "path/to/gguf_streaming_chatbot.py"

"""

import streamlit as st
from llmware.models import ModelCatalog
from llmware.gguf_configs import GGUFConfigs

GGUFConfigs().set_config("max_output_tokens", 500)


def simple_chat_ui_app (model_name):

    st.title(f"Simple Chat with {model_name}")

    model = ModelCatalog().load_model(model_name, temperature=0.3, sample=True, max_output=450)

    # initialize chat history
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # display chat messages from history on app rerun
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # accept user input
    prompt = st.chat_input("Say something")
    if prompt:

        with st.chat_message("user"):
            st.markdown(prompt)

        with st.chat_message("assistant"):

            # note that the st.write_stream method consumes a generator - so pass model.stream(prompt) directly
            bot_response = st.write_stream(model.stream(prompt))

        st.session_state.messages.append({"role": "user", "content": prompt})
        st.session_state.messages.append({"role": "assistant", "content": bot_response})

    return 0


if __name__ == "__main__":

    # a few representative good chat models that can run locally
    # note: will take a minute for the first time it is downloaded and cached locally

    chat_models = ["phi-3-gguf",
                   "llama-2-7b-chat-gguf",
                   "llama-3-instruct-bartowski-gguf",
                   "openhermes-mistral-7b-gguf",
                   "zephyr-7b-gguf",
                   "tiny-llama-chat-gguf"]

    model_name = chat_models[0]

    simple_chat_ui_app(model_name)
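One design note on the recipe above: st.session_state.messages is used for display only, so each call to model.stream(prompt) sees just the latest user prompt rather than the whole conversation. A rough sketch of one way to fold recent turns back into the prompt is below; the build_history_prompt helper is hypothetical and not part of llmware or this commit.

def build_history_prompt(messages, new_prompt, max_turns=3):

    """ Concatenates the last few user/assistant turns ahead of the new prompt so that
    the model receives some conversational context. """

    history = ""
    for message in messages[-2 * max_turns:]:
        history += f"{message['role']}: {message['content']}\n"

    return history + f"user: {new_prompt}\nassistant:"

# inside the chat handler, in place of model.stream(prompt):
# bot_response = st.write_stream(model.stream(build_history_prompt(st.session_state.messages, prompt)))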

llmware/model_configs.py

Lines changed: 9 additions & 0 deletions
@@ -554,6 +554,15 @@
     "link": "https://huggingface.co/bartowski/Meta-Llama-3-8B-Instruct-GGUF",
     "custom_model_files": [], "custom_model_repo": ""},

    {"model_name": "tiny-llama-chat-gguf", "display_name": "tiny-llama-chat-gguf",
     "model_family": "GGUFGenerativeModel", "model_category": "generative_local", "model_location": "llmware_repo",
     "context_window": 2048, "instruction_following": False, "prompt_wrapper": "hf_chat",
     "temperature": 0.3, "sample_default": True, "trailing_space": "",
     "gguf_file": "tiny-llama-chat.gguf",
     "gguf_repo": "llmware/bonchon",
     "link": "https://huggingface.co/llmware/bonchon",
     "custom_model_files": [], "custom_model_repo": ""},

    # end - new llama-3 quantized models

    # whisper-cpp models
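With the entry above registered in the catalog, the new tiny-llama-chat-gguf model can be loaded and streamed like the other GGUF chat models. A minimal sketch (not part of the commit), mirroring the examples above:

from llmware.models import ModelCatalog

# "tiny-llama-chat-gguf" is the catalog name added in this commit - on first use the gguf file
# is pulled from the llmware/bonchon repo on Hugging Face and cached locally
model = ModelCatalog().load_model("tiny-llama-chat-gguf", max_output=200)

for streamed_token in model.stream("What is the role of a central bank?"):
    print(streamed_token, end="")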

llmware/models.py

Lines changed: 111 additions & 0 deletions
@@ -6031,6 +6031,117 @@ def function_call(self, context, function=None, params=None, get_logits=True,
        return output_response

    def stream(self, prompt, add_context=None, add_prompt_engineering=None, api_key=None, inference_dict=None,
               get_logits=False, disable_eos=False):

        """ Main method for text streaming generation. Returns a generator that yields one
        detokenized token string at a time for real-time streaming to console or UI. """

        # first prepare the prompt

        if add_context:
            self.add_context = add_context

        if add_prompt_engineering:
            self.add_prompt_engineering = add_prompt_engineering

        # update default handling for no add_prompt_engineering

        if not self.add_prompt_engineering:
            if self.add_context:
                self.add_prompt_engineering = "default_with_context"
            else:
                self.add_prompt_engineering = "default_no_context"

        # end - update

        # show warning if function calling model
        if self.fc_supported:
            logging.warning("warning: this is a function calling model - using .stream may lead to unexpected "
                            "results. Recommended to use the .function_call method to ensure correct prompt "
                            "template packaging.")

        # start with clean logits_record and output_tokens for each call
        self.logits_record = []
        self.output_tokens = []

        if get_logits:
            self.get_logits = get_logits

        if inference_dict:

            if "temperature" in inference_dict:
                self.temperature = inference_dict["temperature"]

            if "max_tokens" in inference_dict:
                self.target_requested_output_tokens = inference_dict["max_tokens"]

        if self.add_prompt_engineering:
            prompt_enriched = self.prompt_engineer(prompt, self.add_context, inference_dict=inference_dict)
            prompt_final = prompt_enriched

            # most models perform better with no trailing space or line-break at the end of prompt
            # -- in most cases, the trailing space will be ""
            # -- yi model prefers a trailing "\n"
            # -- keep as parameterized option to maximize generation performance
            # -- can be passed either thru model_card or model config from HF

            prompt = prompt_final + self.trailing_space

        # starts _inference here
        completion_tokens = [] if len(prompt) > 0 else [self.token_bos()]

        prompt_tokens = (
            (
                self.tokenize(prompt.encode("utf-8"), special=True)
                if prompt != ""
                else [self.token_bos()]
            )
            if isinstance(prompt, str)
            else prompt
        )

        # confirm that input is smaller than context_window
        input_len = len(prompt_tokens)
        context_window = self.n_ctx()

        if input_len > context_window:
            logging.warning("update: GGUFGenerativeModel - input is too long for model context window - truncating")
            min_output_len = 10
            prompt_tokens = prompt_tokens[0:context_window - min_output_len]
            input_len = len(prompt_tokens)

        text = b""

        for token in self.generate(prompt_tokens):

            completion_tokens.append(token)

            if not disable_eos:
                if token == self._token_eos:
                    break

            if len(completion_tokens) > self.max_output_len:
                break

            # stop if combined input + output at context window size
            if (input_len + len(completion_tokens)) >= context_window:
                break

            new_token = self.detokenize([token]).decode('utf-8', errors='ignore')

            yield new_token

        text_str = text.decode("utf-8", errors="ignore")

        return text_str


class WhisperCPPModel:

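Taken together, the generation loop above stops at the first of three conditions: an EOS token (unless disable_eos=True), the model's max_output_len, or the combined input plus output reaching the context window. The short sketch below (not part of the commit) shows how the two output caps used in the examples relate: the GGUFConfigs value is the engine-wide ceiling, and load_model's max_output can be set anywhere up to it.

from llmware.models import ModelCatalog
from llmware.gguf_configs import GGUFConfigs

# engine-wide ceiling for GGUF output length - per-model settings are bounded by this value
GGUFConfigs().set_config("max_output_tokens", 1000)

# per-model cap, set anywhere up to the 1000-token ceiling above
model = ModelCatalog().load_model("phi-3-gguf", max_output=300)

# stream() yields strings, so the full response can be assembled with a simple join
response = "".join(model.stream("Summarize the main lines of business of a retail bank."))

print(response)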