
Commit bcf63f3

Merge pull request llmware-ai#991 from llmware-ai/update-model-configs
updating model config and prompt wrappers
2 parents 050c128 + e70f5d4 commit bcf63f3

3 files changed: +94 -3 lines changed

llmware/model_configs.py

Lines changed: 20 additions & 2 deletions
@@ -1737,8 +1737,26 @@
                      "system_stop": "<|eot_id|>",
                      "main_start": "<|start_header_id|>user>|end_header_id|>\n",
                      "main_stop": "<|eot_id|>",
-                     "start_llm_response": "<|start_header_id|>assistant<|end_header_id|>\n"}
-                     }
+                     "start_llm_response": "<|start_header_id|>assistant<|end_header_id|>\n"},
+
+    "tiny_llama_chat": {"system_start": "<|system|>", "system_stop": "</s>",
+                        "main_start": "<|user|>", "main_stop": "</s>",
+                        "start_llm_response": "<|assistant|>"},
+
+    "stablelm_zephyr_chat": {"system_start": "", "system_stop": "",
+                             "main_start": "<|user|>", "main_stop": "<|endoftext|>\n",
+                             "start_llm_response": "<|assistant|>"},
+
+    "google_gemma_chat": {"system_start": "", "system_stop": "",
+                          "main_start": "<bos><start_of_turn>user\n",
+                          "main_stop": "<end_of_turn>\n",
+                          "start_llm_response": "<start_of_turn>model"},
+
+    "vicuna_chat": {"system_start": "", "system_stop": "",
+                    "main_start": "USER: ", "main_stop": "",
+                    "start_llm_response": " ASSISTANT:"}
+
+    }

 """ Global default prompt catalog consists of a set of prebuilt useful prompt instructions across a wide range
 of models. Unlike prompt_wrappers, which tend to be an attribute of the model, the prompt catalog can be invoked
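For orientation, each of the new entries follows the same shape as the existing prompt wrappers: start/stop tokens for the system turn, start/stop tokens for the user turn, and the token that opens the model's reply. Below is a minimal standalone sketch (not the llmware API; compose_prompt is a hypothetical helper) of how one of the new wrappers, tiny_llama_chat, would assemble a full prompt string.

# hypothetical helper, illustrating how a prompt wrapper dict is applied
def compose_prompt(wrapper, system_message, user_message):
    prompt = ""
    # wrappers with empty system tokens (e.g., vicuna_chat) simply skip the system turn
    if system_message and wrapper["system_start"]:
        prompt += wrapper["system_start"] + system_message + wrapper["system_stop"]
    prompt += wrapper["main_start"] + user_message + wrapper["main_stop"]
    # generation is expected to begin immediately after this token
    prompt += wrapper["start_llm_response"]
    return prompt

tiny_llama_chat = {"system_start": "<|system|>", "system_stop": "</s>",
                   "main_start": "<|user|>", "main_stop": "</s>",
                   "start_llm_response": "<|assistant|>"}

print(compose_prompt(tiny_llama_chat, "You are a helpful assistant.", "What is a prompt wrapper?"))
# -> <|system|>You are a helpful assistant.</s><|user|>What is a prompt wrapper?</s><|assistant|>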

llmware/models.py

Lines changed: 5 additions & 1 deletion
@@ -89,7 +89,8 @@ class _ModelRegistry:
     # most fine-tuned models require a specific prompt wrapping that was used in the fine-tuning process
     # we are treating these "prompt_wrappers" as core attributes of the model
     prompt_wrappers = ["alpaca", "human_bot", "chatgpt", "<INST>", "open_chat", "hf_chat", "chat_ml", "phi_3",
-                       "llama_3_chat"]
+                       "llama_3_chat","tiny_llama_chat","stablelm_zephyr_chat", "google_gemma_chat",
+                       "vicuna_chat"]

     registered_wrappers = global_model_finetuning_prompt_wrappers_lookup

@@ -175,8 +176,11 @@ def validate(cls, model_card_dict):
         if "model_family" not in model_card_dict:
             return False

+        # removing this condition from validation - provides more extensibility in creating new model classes
+        """
         if model_card_dict["model_family"] not in cls.model_classes:
             return False
+        """

         if "prompt_wrapper" in model_card_dict:

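Illustration only: with the model_family check commented out of validate(), a model card that declares a custom, not-yet-built-in model class is no longer rejected at this step. The card below is a made-up example; only the "model_family" and "prompt_wrapper" keys come from the validation code in this diff, and the wrapper name is one of the entries registered above.

# hypothetical model card - values are illustrative, not part of the repo
custom_model_card = {
    "model_name": "my-org/my-finetuned-model",    # made-up identifier
    "model_family": "MyCustomGenerativeModel",    # previously had to appear in cls.model_classes
    "prompt_wrapper": "vicuna_chat"               # one of the wrappers added in this commit
}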
llmware/util.py

Lines changed: 69 additions & 0 deletions
@@ -748,6 +748,75 @@ def find_match(self, key_term, sentence):

         return matches_found

+    def locate_query_match(self,query, core_text):
+
+        """ Utility function to locate the character-level match of a query inside a core_text. """
+
+        matches_found = []
+
+        # edge case - but return empty match if query is null
+        if not query:
+            return matches_found
+
+        b = CorpTokenizer(one_letter_removal=False, remove_stop_words=False, remove_punctuation=False,
+                          remove_numbers=False)
+
+        query_tokens = b.tokenize(query)
+
+        for x in range(0, len(core_text)):
+            match = 0
+            for key_term in query_tokens:
+                if len(key_term) == 0:
+                    continue
+
+                if key_term.startswith('"'):
+                    key_term = key_term[1:-1]
+
+                if core_text[x].lower() == key_term[0].lower():
+                    match += 1
+                    if (x + len(key_term)) <= len(core_text):
+                        for y in range(1, len(key_term)):
+                            if key_term[y].lower() == core_text[x + y].lower():
+                                match += 1
+                            else:
+                                match = -1
+                                break
+
+                        if match == len(key_term):
+                            new_entry = [x, key_term]
+                            matches_found.append(new_entry)
+
+        return matches_found
+
+    def highlighter(self,matches, core_string, highlight_start_token="<b>",
+                    highlight_end_token="</b>"):
+
+        """ Utility function to 'highlight' a selected token, based on matches, typically found
+        in locate_query_match function - useful for visual display of a matching keyword. """
+
+        # assumes by default:
+        # highlight_start_token = "<b>"
+        # highlight_end_token = "</b>"
+
+        updated_string = ""
+        cursor_position = 0
+
+        for mat in matches:
+            starter = mat[0]
+            keyword = mat[1]
+
+            updated_string += core_string[cursor_position:starter]
+            updated_string += highlight_start_token
+            updated_string += keyword
+            updated_string += highlight_end_token
+
+            cursor_position = starter + len(keyword)
+
+        if cursor_position < len(core_string):
+            updated_string += core_string[cursor_position:]
+
+        return updated_string
+
     def package_answer(self, raw_query, text_core, answer_window, x):

         """ Takes a raw_query, text and answer_window as input and returns a context window around matches
