前言
日常没空,留着以后写
llama-index简介
官网:https://docs.llamaindex.ai/en/stable/
简介也没空,以后再写
注:llama-index 官方接口更新较快,下面的代码可能随版本变动而失效;如果运行不起来,请到官网查阅最新文档
加载本地embedding模型
如果提示找不到 llama_index.embeddings.huggingface 模块,
请先安装:pip install llama-index-embeddings-huggingface
如果仍然不行,请进入官网,输入 huggingface 进行搜索
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
# Build the embedding model on the GPU from a local checkpoint path, then
# register it as the global LlamaIndex embedding model.
# `embed_model_path` is not defined in this snippet; define it before running.
local_embedder = HuggingFaceEmbedding(
    model_name=f"{embed_model_path}",
    device='cuda',
)
Settings.embed_model = local_embedder
加载本地LLM模型
还是那句话,如果以下代码不行,进官网搜索Custom LLM Model
from typing import Any

from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from transformers import AutoTokenizer, AutoModelForCausalLM
class GLMCustomLLM(CustomLLM):
    """LlamaIndex ``CustomLLM`` backed by a locally stored GLM-4 chat model.

    The tokenizer and model are loaded with Hugging Face ``transformers`` in
    ``__init__``; ``complete`` and ``stream_complete`` both run a full
    ``generate`` call and return only the newly generated text.
    """

    # NOTE(review): num_output is almost as large as context_window. LlamaIndex
    # presumably reserves num_output tokens out of the window when packing
    # retrieved context -- confirm and tune these two values for your use case.
    context_window: int = 8192  # context window size reported in the metadata
    num_output: int = 8000  # number of output tokens
    model_name: str = "glm-4-9b-chat"  # model name reported in the metadata
    tokenizer: object = None  # tokenizer, assigned in __init__
    model: object = None  # causal LM, assigned in __init__
    dummy_response: str = "My response"  # kept for compatibility; unused here

    def __init__(self, pretrained_model_name_or_path, device="cuda"):
        """Load the tokenizer and the model.

        Args:
            pretrained_model_name_or_path: Local directory or hub id passed to
                ``from_pretrained``.
            device: Value forwarded as ``device_map`` (``"cuda"`` by default,
                ``"cpu"`` for CPU-only machines).
        """
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            device_map=device,
            trust_remote_code=True,
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path,
            device_map=device,
            trust_remote_code=True,
        ).eval()
        # Cast the weights to float32, exactly as the original code did.
        # NOTE(review): this raises memory use compared with half precision --
        # confirm it is intended when running on a GPU.
        self.model = self.model.float()

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    def _generate_text(self, prompt: str) -> str:
        """Run one generation pass and return only the newly generated text."""
        input_ids = self.tokenizer.encode(prompt, return_tensors='pt')
        # Follow the device the model was loaded on instead of hard-coding CUDA.
        input_ids = input_ids.to(self.model.device)
        # num_output is the number of *output* tokens, so bound the new tokens
        # rather than the total (prompt + output) length.
        outputs = self.model.generate(input_ids, max_new_tokens=self.num_output)
        # For a causal LM, generate() returns the prompt ids followed by the
        # continuation; drop the prompt so the answer does not echo it.
        new_token_ids = outputs[0][input_ids.shape[1]:]
        return self.tokenizer.decode(new_token_ids, skip_special_tokens=True)

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """Return the model's completion for ``prompt`` as a single response."""
        print("完成函数")
        return CompletionResponse(text=self._generate_text(prompt))

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        """Yield the completion for ``prompt`` one character at a time.

        The whole answer is generated first and then replayed, so this is not
        incremental generation. Each yielded response carries the text
        accumulated so far in ``text`` and the newest character in ``delta``.
        """
        print("流式完成函数")
        generated = self._generate_text(prompt)
        text_so_far = ""
        for char in generated:
            text_so_far += char
            yield CompletionResponse(text=text_so_far, delta=char)
基于本地模型搭建简易RAG
from typing import Any
from llama_index.core.llms import (
CustomLLM,
CompletionResponse,
CompletionResponseGen,
LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_index.core import Settings,VectorStoreIndex,SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
class GLMCustomLLM(CustomLLM):
    """LlamaIndex ``CustomLLM`` backed by a locally stored GLM-4 chat model.

    The tokenizer and model are loaded with Hugging Face ``transformers`` in
    ``__init__``; ``complete`` and ``stream_complete`` both run a full
    ``generate`` call and return only the newly generated text.
    """

    # NOTE(review): num_output is almost as large as context_window. LlamaIndex
    # presumably reserves num_output tokens out of the window when packing
    # retrieved context -- confirm and tune these two values for your use case.
    context_window: int = 8192  # context window size reported in the metadata
    num_output: int = 8000  # number of output tokens
    model_name: str = "glm-4-9b-chat"  # model name reported in the metadata
    tokenizer: object = None  # tokenizer, assigned in __init__
    model: object = None  # causal LM, assigned in __init__
    dummy_response: str = "My response"  # kept for compatibility; unused here

    def __init__(self, pretrained_model_name_or_path, device="cuda"):
        """Load the tokenizer and the model.

        Args:
            pretrained_model_name_or_path: Local directory or hub id passed to
                ``from_pretrained``.
            device: Value forwarded as ``device_map`` (``"cuda"`` by default,
                ``"cpu"`` for CPU-only machines).
        """
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            device_map=device,
            trust_remote_code=True,
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path,
            device_map=device,
            trust_remote_code=True,
        ).eval()
        # Cast the weights to float32, exactly as the original code did.
        # NOTE(review): this raises memory use compared with half precision --
        # confirm it is intended when running on a GPU.
        self.model = self.model.float()

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    def _generate_text(self, prompt: str) -> str:
        """Run one generation pass and return only the newly generated text."""
        input_ids = self.tokenizer.encode(prompt, return_tensors='pt')
        # Follow the device the model was loaded on instead of hard-coding CUDA.
        input_ids = input_ids.to(self.model.device)
        # num_output is the number of *output* tokens, so bound the new tokens
        # rather than the total (prompt + output) length.
        outputs = self.model.generate(input_ids, max_new_tokens=self.num_output)
        # For a causal LM, generate() returns the prompt ids followed by the
        # continuation; drop the prompt so the answer does not echo it.
        new_token_ids = outputs[0][input_ids.shape[1]:]
        return self.tokenizer.decode(new_token_ids, skip_special_tokens=True)

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """Return the model's completion for ``prompt`` as a single response."""
        print("完成函数")
        return CompletionResponse(text=self._generate_text(prompt))

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        """Yield the completion for ``prompt`` one character at a time.

        The whole answer is generated first and then replayed, so this is not
        incremental generation. Each yielded response carries the text
        accumulated so far in ``text`` and the newest character in ``delta``.
        """
        print("流式完成函数")
        generated = self._generate_text(prompt)
        text_so_far = ""
        for char in generated:
            text_so_far += char
            yield CompletionResponse(text=text_so_far, delta=char)
if __name__ == "__main__":
    # Local checkpoints: the chat model and the embedding model.
    pretrained_model_name_or_path = r'/home/nlp/model/LLM/THUDM/glm-4-9b-chat'
    embed_model_path = '/home/nlp/model/Embedding/BAAI/bge-m3'

    # Register the embedding model, then the custom LLM, as global settings.
    embedder = HuggingFaceEmbedding(
        model_name=f"{embed_model_path}",
        device='cuda',
    )
    Settings.embed_model = embedder
    llm = GLMCustomLLM(pretrained_model_name_or_path)
    Settings.llm = llm

    # Load the input documents and build a vector index over them.
    # NOTE(review): this path has no leading "/", unlike the model paths above,
    # so it is resolved against the current working directory -- confirm.
    reader = SimpleDirectoryReader(input_dir="home/xxxx/input")
    documents = reader.load_data()
    index = VectorStoreIndex.from_documents(documents)

    # Run a query against the index and print the answer.
    query_engine = index.as_query_engine()
    response = query_engine.query("萧炎的表妹是谁?")
    print(response)
ollama
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
# Load every document found under the "data" directory.
reader = SimpleDirectoryReader("data")
documents = reader.load_data()

# Embedding model: bge-base.
embedder = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
Settings.embed_model = embedder

# LLM: the "llama3" model served through Ollama, with request_timeout=360.0.
ollama_llm = Ollama(model="llama3", request_timeout=360.0)
Settings.llm = ollama_llm

# Build the vector index over the loaded documents.
index = VectorStoreIndex.from_documents(documents)
欢迎大家点赞或收藏
大家的点赞或收藏可以鼓励作者加快更新哟~
参考链接:
LlamaIndex中的CustomLLM(本地加载模型)
llamaIndex 基于GPU加载本地embedding模型
本站资源均来自互联网,仅供研究学习,禁止违法使用和商用,产生法律纠纷本站概不负责!如果侵犯了您的权益请与我们联系!
转载请注明出处: 免费源码网-免费的源码资源网站 » LLM之基于llama-index部署本地embedding与GLM-4模型并初步搭建RAG(其他大模型也可,附上ollama方式运行)
发表评论 取消回复