Update.
This commit is contained in:
parent
49517fc71d
commit
de002c36ba
67
research_train_mistral.md
Normal file
File diff suppressed because one or more lines are too long
79
research_train_mistral_code.py
Normal file
@@ -0,0 +1,79 @@
import json
import os

from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Step 1: Convert text files to JSONL
def convert_to_jsonl(input_dir, output_file):
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for filename in os.listdir(input_dir):
            if filename.endswith('.txt'):
                with open(os.path.join(input_dir, filename), 'r', encoding='utf-8') as infile:
                    text = infile.read()
                # Split text into manageable chunks (512 characters each; the
                # tokenizer enforces the 512-token limit later)
                chunks = [text[i:i + 512] for i in range(0, len(text), 512)]
                for chunk in chunks:
                    entry = {"text": chunk}
                    json.dump(entry, outfile, ensure_ascii=False)
                    outfile.write('\n')
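
# Each line of the resulting JSONL file is a single JSON object holding one
# chunk, which is the format Step 2 reads back in, e.g. (illustrative):
# {"text": "<first 512 characters of a book file>"}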

# Step 2: Fine-tune Mistral model
def fine_tune_mistral(model_name, train_file, output_dir):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    # Mistral tokenizers ship without a pad token; reuse EOS for padding
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    # Load the JSONL dataset and wrap it in a datasets.Dataset so Trainer can index it
    with open(train_file, 'r', encoding='utf-8') as f:
        lines = [json.loads(line) for line in f]
    dataset = Dataset.from_dict({"text": [line["text"] for line in lines]})
    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        num_train_epochs=3,
        save_steps=1000,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        # mlm=False makes the collator copy input_ids into labels for causal-LM loss
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )
    trainer.train()
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
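
# With the arguments above, a checkpoint is written to output_dir every 1000
# steps, and save_total_limit=2 keeps only the two most recent ones.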

# Step 3: Generate text as Harry Potter
def generate_as_harry(model_dir, prompt):
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForCausalLM.from_pretrained(model_dir)
    input_ids = tokenizer.encode(
        f"You are Harry Potter. {prompt}",
        return_tensors="pt",
        max_length=512,
        truncation=True
    )
    # max_new_tokens bounds the continuation regardless of prompt length
    # (a plain max_length=200 would fail on prompts longer than 200 tokens)
    outputs = model.generate(input_ids, max_new_tokens=200, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
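
# Example call (output is hypothetical; the model echoes the prompt prefix
# because generate() returns prompt plus continuation):
#   generate_as_harry("harry_potter_model", "What is your Patronus?")
#   -> "You are Harry Potter. What is your Patronus? A stag, like my father's..."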

# Example usage
if __name__ == "__main__":
    input_dir = "path/to/harry_potter_books"
    output_jsonl = "harry_potter.jsonl"
    # Note: Mixtral-8x7B is a large mixture-of-experts model; full fine-tuning
    # requires multiple high-memory GPUs
    model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
    output_dir = "harry_potter_model"

    # Convert books to JSONL
    convert_to_jsonl(input_dir, output_jsonl)

    # Fine-tune model
    fine_tune_mistral(model_name, output_jsonl, output_dir)

    # Generate response as Harry Potter
    prompt = "How do you feel about facing Voldemort again?"
    response = generate_as_harry(output_dir, prompt)
    print(response)