This article introduces how to use llama-index to deploy a local embedding model together with GLM-4 (other large models work as well) and build an initial RAG pipeline. Hopefully it offers some useful reference for developers working on similar problems; follow along if you need it.
Preface
No time for a proper preface right now; I will fill this in later.
Introduction to llama-index
Official docs: https://docs.llamaindex.ai/en/stable/
No time for a full introduction either; it will come later.
Note: as the official API evolves, the code below may also change. If it does not run for you, check the official documentation.
Loading a local embedding model
If llama_index.embeddings.huggingface cannot be found, install the plugin:
pip install llama-index-embeddings-huggingface
If that still does not work, go to the official docs and search for "huggingface".
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

embed_model_path = '/home/nlp/model/Embedding/BAAI/bge-m3'  # path to your local embedding model
Settings.embed_model = HuggingFaceEmbedding(model_name=f"{embed_model_path}", device='cuda')
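As a quick sanity check (not in the original post), you can embed a short string directly; get_text_embedding is the standard LlamaIndex embedding interface:

# Hypothetical sanity check: embed a short string with the model configured above
vec = Settings.embed_model.get_text_embedding("萧炎是谁？")
print(len(vec))  # prints the embedding dimension (1024 for bge-m3)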
Loading a local LLM
Same caveat as before: if the code below does not work, search the official docs for "Custom LLM Model".
from typing import Any

from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from transformers import AutoTokenizer, AutoModelForCausalLM


class GLMCustomLLM(CustomLLM):
    context_window: int = 8192         # context window size
    num_output: int = 8000             # maximum number of generated tokens
    model_name: str = "glm-4-9b-chat"  # model name
    tokenizer: object = None           # tokenizer
    model: object = None               # model
    dummy_response: str = "My response"

    def __init__(self, pretrained_model_name_or_path):
        super().__init__()
        # Load the model on the GPU
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, device_map="cuda", trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path, device_map="cuda", trust_remote_code=True).eval()
        # Or load the model on the CPU instead
        # self.tokenizer = AutoTokenizer.from_pretrained(
        #     pretrained_model_name_or_path, device_map="cpu", trust_remote_code=True)
        # self.model = AutoModelForCausalLM.from_pretrained(
        #     pretrained_model_name_or_path, device_map="cpu", trust_remote_code=True)
        self.model = self.model.float()

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    # Dummy-response versions from the official CustomLLM example, kept for reference:
    # @llm_completion_callback()
    # def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
    #     return CompletionResponse(text=self.dummy_response)
    #
    # @llm_completion_callback()
    # def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
    #     response = ""
    #     for token in self.dummy_response:
    #         response += token
    #         yield CompletionResponse(text=response, delta=token)

    @llm_completion_callback()  # callback decorator
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        # Non-streaming completion
        print("complete called")
        inputs = self.tokenizer.encode(prompt, return_tensors='pt').cuda()  # GPU
        # inputs = self.tokenizer.encode(prompt, return_tensors='pt')  # CPU
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0])
        return CompletionResponse(text=response)

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
        # Streaming completion (here: yields the decoded text character by character)
        print("stream_complete called")
        inputs = self.tokenizer.encode(prompt, return_tensors='pt').cuda()  # GPU
        # inputs = self.tokenizer.encode(prompt, return_tensors='pt')  # CPU
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0])
        for token in response:
            yield CompletionResponse(text=token, delta=token)
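A minimal usage sketch (not in the original post): instantiate the custom LLM and call complete directly to confirm the weights load. The model path is the one used later in the full script; adjust it to your own directory.

# Hypothetical smoke test for the custom LLM defined above
llm = GLMCustomLLM("/home/nlp/model/LLM/THUDM/glm-4-9b-chat")  # local GLM-4 weights
print(llm.complete("Hello, please introduce yourself.").text)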
Building a simple RAG on top of the local models
from typing import Any

from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


class GLMCustomLLM(CustomLLM):
    context_window: int = 8192         # context window size
    num_output: int = 8000             # maximum number of generated tokens
    model_name: str = "glm-4-9b-chat"  # model name
    tokenizer: object = None           # tokenizer
    model: object = None               # model
    dummy_response: str = "My response"

    def __init__(self, pretrained_model_name_or_path):
        super().__init__()
        # Load the model on the GPU
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, device_map="cuda", trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path, device_map="cuda", trust_remote_code=True).eval()
        # Or load the model on the CPU instead
        # self.tokenizer = AutoTokenizer.from_pretrained(
        #     pretrained_model_name_or_path, device_map="cpu", trust_remote_code=True)
        # self.model = AutoModelForCausalLM.from_pretrained(
        #     pretrained_model_name_or_path, device_map="cpu", trust_remote_code=True)
        self.model = self.model.float()

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    @llm_completion_callback()  # callback decorator
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        # Non-streaming completion
        print("complete called")
        inputs = self.tokenizer.encode(prompt, return_tensors='pt').cuda()  # GPU
        # inputs = self.tokenizer.encode(prompt, return_tensors='pt')  # CPU
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0])
        return CompletionResponse(text=response)

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
        # Streaming completion (here: yields the decoded text character by character)
        print("stream_complete called")
        inputs = self.tokenizer.encode(prompt, return_tensors='pt').cuda()  # GPU
        # inputs = self.tokenizer.encode(prompt, return_tensors='pt')  # CPU
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0])
        for token in response:
            yield CompletionResponse(text=token, delta=token)


if __name__ == "__main__":
    # Point these at your local model directories
    pretrained_model_name_or_path = r'/home/nlp/model/LLM/THUDM/glm-4-9b-chat'
    embed_model_path = '/home/nlp/model/Embedding/BAAI/bge-m3'

    # Register the local embedding model and the custom LLM with LlamaIndex
    Settings.embed_model = HuggingFaceEmbedding(model_name=f"{embed_model_path}", device='cuda')
    Settings.llm = GLMCustomLLM(pretrained_model_name_or_path)

    # Load the documents to index and build the vector index
    documents = SimpleDirectoryReader(input_dir="/home/xxxx/input").load_data()
    index = VectorStoreIndex.from_documents(documents)

    # Query the index and print the result
    query_engine = index.as_query_engine()
    response = query_engine.query("萧炎的表妹是谁?")
    print(response)
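One optional extension not covered in the original post: re-embedding the documents on every run is slow, so you can persist the vector index to disk and reload it later. StorageContext and load_index_from_storage come from llama_index.core; the "./storage" directory name below is just an example, and Settings.embed_model / Settings.llm still need to be configured before reloading.

from llama_index.core import StorageContext, load_index_from_storage

# After building the index once, write it to disk
index.storage_context.persist(persist_dir="./storage")

# On later runs, reload it instead of re-reading and re-embedding the documents
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context)
query_engine = index.as_query_engine()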
If this post helped, a like or bookmark is welcome.
Likes and bookmarks encourage the author to update faster.
References:
CustomLLM in LlamaIndex (loading a local model)
llamaIndex: loading a local embedding model on the GPU
Official documentation
That concludes this article on using llama-index to deploy a local embedding model and GLM-4 (or another large model) and build an initial RAG. I hope it is helpful to fellow developers.