import copy
from pprint import pformat
from threading import Thread
from typing import Dict, Iterator, List, Optional

from qwen_agent.llm.base import register_llm
from qwen_agent.llm.function_calling import BaseFnCallModel
from qwen_agent.llm.schema import ASSISTANT, Message
from qwen_agent.log import logger
from qwen_agent.utils.utils import build_text_completion_prompt


@register_llm('openvino')
class OpenVINO(BaseFnCallModel):
    """
    OpenVINO Pipeline API.

    To use, you should have the 'optimum[openvino]' python package installed.

    Example of exporting and quantizing an OpenVINO model from the command line:

        optimum-cli export openvino --model Qwen/Qwen2-7B-Instruct --task text-generation-with-past --weight-format int4 --group-size 128 --ratio 0.8 Qwen2-7B-Instruct-ov

    Example of passing the exported model directory in directly through the config:

        llm_cfg = {
            'ov_model_dir': 'Qwen2-7B-Instruct-ov',
            'model_type': 'openvino',
            'device': 'cpu'
        }
        system_instruction = '''You are a helpful assistant.
        After receiving the user's request, you should:
        - first draw an image and obtain the image url,
        - then run code `requests.get(image_url)` to download the image,
        - and finally select an image operation from the given document to process the image.
        Please show the image using `plt.show()`.'''
        tools = ['my_image_gen', 'code_interpreter']
        files = ['./examples/resource/doc.pdf']
        bot = Assistant(llm=llm_cfg,
                        system_message=system_instruction,
                        function_list=tools,
                        files=files)
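
    A minimal way to drive the bot (a sketch; the message dict follows qwen_agent's
    chat schema, and the user request here is only a placeholder):

        messages = [{'role': 'user', 'content': 'Draw a dog and rotate it 90 degrees.'}]
        for response in bot.run(messages=messages):
            print(response)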
  35. """

    def __init__(self, cfg: Optional[Dict] = None):
        super().__init__(cfg)
        if 'ov_model_dir' not in cfg:
            raise ValueError('Please provide openvino model directory through `ov_model_dir` in cfg.')
        try:
            from optimum.intel.openvino import OVModelForCausalLM
        except ImportError as e:
            raise ImportError('Could not import optimum-intel python package for openvino. '
                              'Please install it with: '
                              "pip install -U 'optimum[openvino]'") from e
        try:
            from transformers import AutoConfig, AutoTokenizer
        except ImportError as e:
            raise ImportError('Could not import transformers python package for openvino. '
                              'Please install it with: '
                              "pip install -U 'transformers'") from e
        # Load the exported OpenVINO model and its tokenizer from the local directory.
        self.ov_model = OVModelForCausalLM.from_pretrained(
            cfg['ov_model_dir'],
            device=cfg.get('device', 'cpu'),
            ov_config=cfg.get('ov_config', {}),
            config=AutoConfig.from_pretrained(cfg['ov_model_dir']),
        )
        self.tokenizer = AutoTokenizer.from_pretrained(cfg['ov_model_dir'])
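
    # Example configuration with `ov_config` (illustrative values; the keys are
    # standard OpenVINO runtime properties and may differ across versions and devices):
    #   llm_cfg = {
    #       'model_type': 'openvino',
    #       'ov_model_dir': 'Qwen2-7B-Instruct-ov',
    #       'device': 'GPU',
    #       'ov_config': {'PERFORMANCE_HINT': 'LATENCY', 'CACHE_DIR': './ov_cache'},
    #   }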

    def _get_stopping_criteria(self, generate_cfg: dict):
        from transformers.generation.stopping_criteria import StoppingCriteria, StoppingCriteriaList

        class StopSequenceCriteria(StoppingCriteria):
            """
            This class can be used to stop generation whenever a sequence of tokens is encountered.

            Args:
                stop_sequences (`str` or `List[str]`):
                    The sequence (or list of sequences) on which to stop execution.
                tokenizer:
                    The tokenizer used to decode the model outputs.
            """

            def __init__(self, stop_sequences, tokenizer):
                if isinstance(stop_sequences, str):
                    stop_sequences = [stop_sequences]
                self.stop_sequences = stop_sequences
                self.tokenizer = tokenizer

            def __call__(self, input_ids, scores, **kwargs) -> bool:
                decoded_output = self.tokenizer.decode(input_ids.tolist()[0])
                return any(decoded_output.endswith(stop_sequence) for stop_sequence in self.stop_sequences)

        return StoppingCriteriaList([StopSequenceCriteria(generate_cfg['stop'], self.tokenizer)])
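
    # `generate_cfg['stop']` is expected to be a string or a list of strings,
    # e.g. ['Observation:', '<|im_end|>'] (illustrative values); generation halts
    # as soon as the decoded output ends with any of them.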

    def _chat_stream(
        self,
        messages: List[Message],
        delta_stream: bool,
        generate_cfg: dict,
    ) -> Iterator[List[Message]]:
        from transformers import TextIteratorStreamer
        generate_cfg = copy.deepcopy(generate_cfg)
        prompt = build_text_completion_prompt(messages)
        logger.debug(f'LLM Input:\n{pformat(prompt, indent=2)}')
        input_token = self.tokenizer(prompt, return_tensors='pt').input_ids
        streamer = TextIteratorStreamer(self.tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
        generate_cfg.update(
            dict(
                input_ids=input_token,
                streamer=streamer,
                max_new_tokens=generate_cfg.get('max_new_tokens', 2048),
                stopping_criteria=self._get_stopping_criteria(generate_cfg=generate_cfg),
            ))
        # `stop` is consumed by the stopping criteria above and is not a valid
        # argument to `generate()`, so drop it before the call.
        del generate_cfg['stop']

        def generate_and_signal_complete():
            self.ov_model.generate(**generate_cfg)

        # Run generation in a background thread and stream decoded text as it arrives.
        t1 = Thread(target=generate_and_signal_complete)
        t1.start()
        partial_text = ''
        for new_text in streamer:
            partial_text += new_text
            if delta_stream:
                yield [Message(ASSISTANT, new_text)]
            else:
                yield [Message(ASSISTANT, partial_text)]
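
    # Note: with `delta_stream=True` the caller receives only the newly generated
    # chunk on each iteration; with `delta_stream=False` it receives the full
    # accumulated text so far.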

    def _chat_no_stream(
        self,
        messages: List[Message],
        generate_cfg: dict,
    ) -> List[Message]:
        generate_cfg = copy.deepcopy(generate_cfg)
        prompt = build_text_completion_prompt(messages)
        logger.debug(f'LLM Input:\n{pformat(prompt, indent=2)}')
        input_token = self.tokenizer(prompt, return_tensors='pt').input_ids
        generate_cfg.update(
            dict(
                input_ids=input_token,
                max_new_tokens=generate_cfg.get('max_new_tokens', 2048),
                stopping_criteria=self._get_stopping_criteria(generate_cfg=generate_cfg),
            ))
        # As in `_chat_stream`, `stop` is handled via the stopping criteria.
        del generate_cfg['stop']
        response = self.ov_model.generate(**generate_cfg)
        # `generate()` returns prompt + completion tokens; strip the prompt before decoding.
        response = response[:, len(input_token[0]):]
        answer = self.tokenizer.batch_decode(response, skip_special_tokens=True)[0]
        return [Message(ASSISTANT, answer)]
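

if __name__ == '__main__':
    # Minimal smoke test (a sketch): assumes a model has been exported to
    # ./Qwen2-7B-Instruct-ov with the optimum-cli command shown in the class docstring.
    from qwen_agent.llm import get_chat_model

    llm = get_chat_model({
        'model_type': 'openvino',
        'ov_model_dir': 'Qwen2-7B-Instruct-ov',
        'device': 'cpu',
    })
    messages = [{'role': 'user', 'content': 'Hello!'}]
    response = []
    for response in llm.chat(messages=messages, stream=True):
        continue
    print(response)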