Skip to content

Commit ff580fa

Browse files
committed
add qwen-audio support and allow overriding default settings via environment variables
1 parent 6037e4e commit ff580fa

File tree

12 files changed

+105
-35
lines changed

12 files changed

+105
-35
lines changed

‎examples/assistant_audio.py‎

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
from qwen_agent.agents import Assistant
2+
from qwen_agent.gui import WebUI
3+
4+
5+
def test():
6+
bot = Assistant(llm={'model_type': 'qwenaudio_dashscope', 'model': 'qwen-audio-turbo-latest'})
7+
messages = [{
8+
'role':
9+
'user',
10+
'content': [{
11+
'audio': 'https://dashscope.oss-cn-beijing.aliyuncs.com/audios/welcome.mp3'
12+
}, {
13+
'text': '这段音频在说什么?'
14+
}]
15+
}]
16+
for rsp in bot.run(messages):
17+
print(rsp)
18+
19+
20+
def app_gui():
21+
# Define the agent
22+
bot = Assistant(llm={'model': 'qwen-audio-turbo-latest'})
23+
WebUI(bot).run()
24+
25+
26+
if __name__ == '__main__':
27+
# test()
28+
app_gui()

‎qwen_agent/agents/assistant.py‎

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
import copy
22
import datetime
3+
import json
34
from typing import Dict, Iterator, List, Literal, Optional, Union
45

5-
import json5
6-
76
from qwen_agent.agents.fncall_agent import FnCallAgent
87
from qwen_agent.llm import BaseChatModel
98
from qwen_agent.llm.schema import CONTENT, DEFAULT_SYSTEM_MESSAGE, ROLE, SYSTEM, Message
@@ -41,7 +40,7 @@ def format_knowledge_to_source_and_content(result: Union[str, List[dict]]) -> Li
4140
if isinstance(result, str):
4241
result = f'{result}'.strip()
4342
try:
44-
docs = json5.loads(result)
43+
docs = json.loads(result)
4544
except Exception:
4645
print_traceback()
4746
knowledge.append({'source': '上传的文档', 'content': result})

‎qwen_agent/agents/dialogue_retrieval_agent.py‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def _run(self,
7070
new_content = [ContentItem(text=query), ContentItem(file=file_path)]
7171
if isinstance(messages[-1].content, list):
7272
for item in messages[-1].content:
73-
if item.file or item.image:
73+
if item.file or item.image or item.audio:
7474
new_content.append(item)
7575
new_messages.append(Message(role=USER, content=new_content))
7676

‎qwen_agent/gui/web_ui.py‎

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from qwen_agent.agents.user_agent import PENDING_USER_INPUT
88
from qwen_agent.gui.gradio_utils import format_cover_html
99
from qwen_agent.gui.utils import convert_fncall_to_text, convert_history_to_chatbot, get_avatar_image
10-
from qwen_agent.llm.schema import CONTENT, FILE, IMAGE, NAME, ROLE, USER, Message
10+
from qwen_agent.llm.schema import AUDIO, CONTENT, FILE, IMAGE, NAME, ROLE, USER, Message
1111
from qwen_agent.log import logger
1212
from qwen_agent.utils.utils import print_traceback
1313

@@ -212,6 +212,8 @@ def add_text(self, _input, _chatbot, _history):
212212
for file in _input.files:
213213
if file.mime_type.startswith('image/'):
214214
_history[-1][CONTENT].append({IMAGE: 'file://' + file.path})
215+
elif file.mime_type.startswith('audio/'):
216+
_history[-1][CONTENT].append({AUDIO: 'file://' + file.path})
215217
else:
216218
_history[-1][CONTENT].append({FILE: file.path})
217219

‎qwen_agent/llm/__init__.py‎

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from .oai import TextChatAtOAI
77
from .openvino import OpenVINO
88
from .qwen_dashscope import QwenChatAtDS
9+
from .qwenaudio_dashscope import QwenAudioChatAtDS
910
from .qwenvl_dashscope import QwenVLChatAtDS
1011
from .qwenvl_oai import QwenVLChatAtOAI
1112

@@ -62,11 +63,15 @@ def get_chat_model(cfg: Union[dict, str] = 'qwen-plus') -> BaseChatModel:
6263

6364
model = cfg.get('model', '')
6465

65-
if 'qwen-vl' in model:
66+
if '-vl' in model.lower():
6667
model_type = 'qwenvl_dashscope'
6768
return LLM_REGISTRY[model_type](cfg)
6869

69-
if 'qwen' in model:
70+
if '-audio' in model.lower():
71+
model_type = 'qwenaudio_dashscope'
72+
return LLM_REGISTRY[model_type](cfg)
73+
74+
if 'qwen' in model.lower():
7075
model_type = 'qwen_dashscope'
7176
return LLM_REGISTRY[model_type](cfg)
7277

@@ -80,6 +85,7 @@ def get_chat_model(cfg: Union[dict, str] = 'qwen-plus') -> BaseChatModel:
8085
'TextChatAtAzure',
8186
'QwenVLChatAtDS',
8287
'QwenVLChatAtOAI',
88+
'QwenAudioChatAtDS',
8389
'OpenVINO',
8490
'get_chat_model',
8591
'ModelServiceError',
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
from typing import Dict, Optional
2+
3+
from qwen_agent.llm.base import register_llm
4+
from qwen_agent.llm.qwenvl_dashscope import QwenVLChatAtDS
5+
6+
7+
@register_llm('qwenaudio_dashscope')
8+
class QwenAudioChatAtDS(QwenVLChatAtDS):
9+
10+
def __init__(self, cfg: Optional[Dict] = None):
11+
super().__init__(cfg)
12+
self.model = self.model or 'qwen-audio-turbo-latest'

‎qwen_agent/llm/qwenvl_dashscope.py‎

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,22 @@ def _format_local_files(messages: List[Message]) -> List[Message]:
9393
fname = fname.replace('\\', '/')
9494
fname = 'file://' + fname
9595
item.image = fname
96+
if item.audio:
97+
fname = item.audio
98+
if not fname.startswith((
99+
'http://',
100+
'https://',
101+
'file://',
102+
'data:', # base64 data URI, e.g. f"data:audio/mpeg;base64,{audio_base64}"
103+
)):
104+
if fname.startswith('~'):
105+
fname = os.path.expanduser(fname)
106+
fname = os.path.abspath(fname)
107+
if os.path.isfile(fname):
108+
if re.match(r'^[A-Za-z]:\\', fname):
109+
fname = fname.replace('\\', '/')
110+
fname = 'file://' + fname
111+
item.audio = fname
96112
return messages
97113

98114

‎qwen_agent/llm/schema.py‎

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
FILE = 'file'
1717
IMAGE = 'image'
18+
AUDIO = 'audio'
1819

1920

2021
class BaseModelCompatibleDict(BaseModel):
@@ -64,9 +65,14 @@ class ContentItem(BaseModelCompatibleDict):
6465
text: Optional[str] = None
6566
image: Optional[str] = None
6667
file: Optional[str] = None
68+
audio: Optional[str] = None
6769

68-
def __init__(self, text: Optional[str] = None, image: Optional[str] = None, file: Optional[str] = None):
69-
super().__init__(text=text, image=image, file=file)
70+
def __init__(self,
71+
text: Optional[str] = None,
72+
image: Optional[str] = None,
73+
file: Optional[str] = None,
74+
audio: Optional[str] = None):
75+
super().__init__(text=text, image=image, file=file, audio=audio)
7076

7177
@model_validator(mode='after')
7278
def check_exclusivity(self):
@@ -77,21 +83,23 @@ def check_exclusivity(self):
7783
provided_fields += 1
7884
if self.file:
7985
provided_fields += 1
86+
if self.audio:
87+
provided_fields += 1
8088

8189
if provided_fields != 1:
82-
raise ValueError("Exactly one of 'text', 'image', or 'file' must be provided.")
90+
raise ValueError("Exactly one of 'text', 'image', 'file', or 'audio' must be provided.")
8391
return self
8492

8593
def __repr__(self):
8694
return f'ContentItem({self.model_dump()})'
8795

88-
def get_type_and_value(self) -> Tuple[Literal['text', 'image', 'file'], str]:
96+
def get_type_and_value(self) -> Tuple[Literal['text', 'image', 'file', 'audio'], str]:
8997
(t, v), = self.model_dump().items()
90-
assert t in ('text', 'image', 'file')
98+
assert t in ('text', 'image', 'file', 'audio')
9199
return t, v
92100

93101
@property
94-
def type(self) -> Literal['text', 'image', 'file']:
102+
def type(self) -> Literal['text', 'image', 'file', 'audio']:
95103
t, v = self.get_type_and_value()
96104
return t
97105

‎qwen_agent/settings.py‎

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,25 @@
1+
import ast
2+
import os
13
from typing import List, Literal
24

35
# Settings for LLMs
4-
DEFAULT_MAX_INPUT_TOKENS: int = 28000 # The LLM will truncate the input messages if they exceed this limit
6+
DEFAULT_MAX_INPUT_TOKENS: int = int(os.getenv(
7+
'QWEN_AGENT_DEFAULT_MAX_INPUT_TOKENS', 30000)) # The LLM will truncate the input messages if they exceed this limit
58

69
# Settings for agents
7-
MAX_LLM_CALL_PER_RUN: int = 8
10+
MAX_LLM_CALL_PER_RUN: int = int(os.getenv('QWEN_AGENT_MAX_LLM_CALL_PER_RUN', 8))
811

912
# Settings for tools
10-
DEFAULT_WORKSPACE: str = 'workspace'
13+
DEFAULT_WORKSPACE: str = os.getenv('QWEN_AGENT_DEFAULT_WORKSPACE', 'workspace')
1114

1215
# Settings for RAG
13-
DEFAULT_MAX_REF_TOKEN: int = 4000 # The window size reserved for RAG materials
14-
DEFAULT_PARSER_PAGE_SIZE: int = 500 # Max tokens per chunk when doing RAG
16+
DEFAULT_MAX_REF_TOKEN: int = int(os.getenv('QWEN_AGENT_DEFAULT_MAX_REF_TOKEN',
17+
20000)) # The window size reserved for RAG materials
18+
DEFAULT_PARSER_PAGE_SIZE: int = int(os.getenv('QWEN_AGENT_DEFAULT_PARSER_PAGE_SIZE',
19+
500)) # Max tokens per chunk when doing RAG
1520
DEFAULT_RAG_KEYGEN_STRATEGY: Literal['None', 'GenKeyword', 'SplitQueryThenGenKeyword', 'GenKeywordWithKnowledge',
16-
'SplitQueryThenGenKeywordWithKnowledge'] = 'SplitQueryThenGenKeyword'
17-
DEFAULT_RAG_SEARCHERS: List[str] = ['keyword_search', 'front_page_search'] # Sub-searchers for hybrid retrieval
21+
'SplitQueryThenGenKeywordWithKnowledge'] = os.getenv(
22+
'QWEN_AGENT_DEFAULT_RAG_KEYGEN_STRATEGY', 'GenKeyword')
23+
DEFAULT_RAG_SEARCHERS: List[str] = ast.literal_eval(
24+
os.getenv('QWEN_AGENT_DEFAULT_RAG_SEARCHERS',
25+
"['keyword_search', 'front_page_search']")) # Sub-searchers for hybrid retrieval

‎qwen_agent/tools/doc_parser.py‎

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import time
55
from typing import Dict, List, Optional, Union
66

7-
import json5
87
from pydantic import BaseModel
98

109
from qwen_agent.log import logger
@@ -90,12 +89,7 @@ def call(self, params: Union[str, dict], **kwargs) -> dict:
9089
try:
9190
# Directly load the chunked doc
9291
record = self.db.get(cached_name_chunking)
93-
try:
94-
record = json5.loads(record)
95-
except ValueError:
96-
logger.warning(
97-
f'Encountered ValueError raised by json5. Fall back to json. File: {cached_name_chunking}')
98-
record = json.loads(record)
92+
record = json.loads(record)
9993
logger.info(f'Read chunked {url} from cache.')
10094
return record
10195
except KeyNotExistsError:

0 commit comments

Comments
 (0)