"""Fine-tune a causal language model on plain-text books and generate in character.

Pipeline:
    1. ``convert_to_jsonl``  - split ``.txt`` files into chunks, one JSON object per line.
    2. ``fine_tune_mistral`` - fine-tune a Hugging Face causal LM on those chunks.
    3. ``generate_as_harry`` - prompt the fine-tuned model to answer as Harry Potter.
"""

import json
import os

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)


# Step 1: Convert text files to JSONL
def convert_to_jsonl(input_dir, output_file, chunk_size=512):
    """Split every ``.txt`` file in *input_dir* into chunks written as JSONL.

    Each output line is a JSON object of the form ``{"text": <chunk>}``.

    Args:
        input_dir: Directory whose ``.txt`` files are read (non-recursive).
        output_file: Path of the JSONL file to create or overwrite.
        chunk_size: Maximum chunk length in *characters* (not tokens); the
            tokenizer applies the real token limit later, during training.

    Raises:
        ValueError: If *chunk_size* is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    with open(output_file, "w", encoding="utf-8") as outfile:
        # os.listdir() returns entries in arbitrary order; sort so the
        # generated dataset is reproducible from run to run.
        for filename in sorted(os.listdir(input_dir)):
            if not filename.endswith(".txt"):
                continue
            path = os.path.join(input_dir, filename)
            with open(path, "r", encoding="utf-8") as infile:
                text = infile.read()
            for start in range(0, len(text), chunk_size):
                chunk = text[start:start + chunk_size]
                # Whitespace-only chunks would become empty training examples.
                if not chunk.strip():
                    continue
                json.dump({"text": chunk}, outfile, ensure_ascii=False)
                outfile.write("\n")


def _load_jsonl_texts(train_file):
    """Return the ``"text"`` field of every non-blank line in a JSONL file."""
    texts = []
    with open(train_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:  # tolerate blank lines, e.g. a trailing newline
                texts.append(json.loads(line)["text"])
    return texts


# Step 2: Fine-tune Mistral model
def fine_tune_mistral(model_name, train_file, output_dir, max_length=512):
    """Fine-tune a causal LM on a JSONL file and save model and tokenizer.

    Args:
        model_name: Hugging Face model id or local path to start from.
        train_file: JSONL file with one ``{"text": ...}`` object per line.
        output_dir: Directory for checkpoints and the final model/tokenizer.
        max_length: Maximum number of tokens per training example; longer
            texts are truncated.

    Raises:
        ValueError: If *train_file* contains no training examples.
    """
    # Read the data first so an empty file fails before the model is loaded.
    texts = _load_jsonl_texts(train_file)
    if not texts:
        raise ValueError(f"No training examples found in {train_file!r}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Some causal-LM tokenizers define no pad token, and padding a batch
    # fails without one; fall back to the end-of-sequence token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_name)

    encodings = tokenizer(texts, truncation=True, max_length=max_length)
    # Trainer needs a map-style dataset (``len()`` plus integer indexing that
    # yields one feature dict per example), not the raw batch encoding.
    train_dataset = [
        {"input_ids": ids, "attention_mask": mask}
        for ids, mask in zip(encodings["input_ids"], encodings["attention_mask"])
    ]

    # With mlm=False the collator pads each batch and copies input_ids into
    # ``labels``, which the model needs in order to return a loss.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        num_train_epochs=3,
        save_steps=1000,
        save_total_limit=2,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )
    trainer.train()

    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)


# Step 3: Generate text as Harry Potter
def generate_as_harry(model_dir, prompt, max_new_tokens=200):
    """Generate a reply to *prompt* in the voice of Harry Potter.

    Args:
        model_dir: Directory holding the fine-tuned model and tokenizer.
        prompt: The user's question or statement.
        max_new_tokens: Upper bound on the number of generated tokens,
            counted separately from the prompt.

    Returns:
        The decoded sequence (persona prefix, prompt and continuation) with
        special tokens removed.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForCausalLM.from_pretrained(model_dir)

    inputs = tokenizer(
        f"You are Harry Potter. \n{prompt}",
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )

    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # max_new_tokens rather than max_length: max_length also counts the
    # prompt tokens, so a long prompt would leave no room for a reply.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        pad_token_id=pad_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def main():
    """Run the full pipeline: convert the books, fine-tune, then generate."""
    input_dir = "path/to/harry_potter_books"
    output_jsonl = "harry_potter.jsonl"
    model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
    output_dir = "harry_potter_model"

    # Convert books to JSONL
    convert_to_jsonl(input_dir, output_jsonl)

    # Fine-tune model
    fine_tune_mistral(model_name, output_jsonl, output_dir)

    # Generate response as Harry Potter
    prompt = "How do you feel about facing Voldemort again?"
    response = generate_as_harry(output_dir, prompt)
    print(response)


# Example usage
if __name__ == "__main__":
    main()