In [None]:
# Upgrade PyTorch to 2.5.0
!pip install torch==2.5.0+cu121 -f https://download.pytorch.org/whl/cu121/torch_stable.html

# Install Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

# Install necessary packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

from unsloth import FastLanguageModel
import torch
import os
import json

# Resolve the Hugging Face read token without clobbering one that is already configured.
# Lookup order: HF_TOKEN_READ, then HF_TOKEN; the 'hf_token' placeholder is used only
# when neither variable is set. Prefer exporting the variable before running the
# notebook over pasting a real token into this cell.
HF_TOKEN = os.environ.get('HF_TOKEN_READ') or os.environ.get('HF_TOKEN') or 'hf_token'
# Publish the resolved value under the name the model-loading call reads.
os.environ['HF_TOKEN_READ'] = HF_TOKEN

# Settings forwarded to the model loader
max_sequence_length = 2048  # passed as max_seq_length (also reused by the trainer)
model_dtype = None          # passed as dtype
load_in_4bit_mode = True    # passed as load_in_4bit

# Gather the loader arguments in one place, then unpack them into the call
model_load_kwargs = dict(
    model_name="Qwen/Qwen2.5-Coder-1.5B",
    max_seq_length=max_sequence_length,
    dtype=model_dtype,
    load_in_4bit=load_in_4bit_mode,
    token=os.getenv('HF_TOKEN_READ'),
)

# Returns the model together with its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(**model_load_kwargs)

In [None]:
# File locations on the mounted Drive:
#   input  - the preprocessed fine-tuning data (JSON)
#   output - the formatted dataset written by this cell (JSONL)
input_file_path = '/content/drive/MyDrive/fine_tune_data.json'
output_file_path = '/content/drive/MyDrive/formatted_dataset.jsonl'

# Attach Google Drive at /content/drive so the paths above resolve
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import json
from datasets import Dataset
from transformers import AutoTokenizer

# Take the EOS token from the tokenizer that was returned together with the model by
# FastLanguageModel.from_pretrained. Loading a second tokenizer here under the same
# name would rebind `tokenizer`, so the trainer and the GGUF export further down would
# receive a different object than the one that was loaded with the model.
end_of_sequence_token = tokenizer.eos_token

# The formatter appends this value to every example; fail early with a clear message
# instead of a TypeError inside dataset.map if the tokenizer defines no EOS token.
if end_of_sequence_token is None:
    raise ValueError("tokenizer.eos_token is None; cannot append an EOS token to the training texts")

# Read the fine-tuning JSON from Drive and parse it
with open(input_file_path, 'r', encoding='utf-8') as source_handle:
    fine_tune_data = json.loads(source_handle.read())

# Wrap the parsed records in a Hugging Face Dataset
dataset = Dataset.from_list(fine_tune_data)

# Prompt template used to merge the fields of one record into a single training text.
# It has three positional `{}` slots which are filled, in order, with the instruction
# (prompt), the input, and the response (completion) via str.format.
formatted_prompt_template = """Below is an instruction, an input, and a response that completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Batch formatter: turns prompt/input/completion columns into one "text" column
def format_prompt_input_completion_pairs(examples):
    """Render every record of a batch through the prompt template.

    Args:
        examples: Mapping with "prompt", "input" and "completion" keys, each
            holding a sequence of values; the three sequences are zipped
            together element-wise.

    Returns:
        A dict with a single "text" key whose value is a list of strings: the
        filled-in template for each record, followed by the EOS token.
    """
    combined_texts = []
    rows = zip(examples["prompt"], examples["input"], examples["completion"])
    for instruction, context, answer in rows:
        rendered = formatted_prompt_template.format(instruction, context, answer)
        # Terminate each example with the EOS token
        combined_texts.append(rendered + end_of_sequence_token)
    return {"text": combined_texts}

# Run the formatter over the dataset in batches
formatted_dataset = dataset.map(format_prompt_input_completion_pairs, batched=True)

# Write the "text" field of every record as one JSON object per line.
# Mode 'w' replaces the file if it already exists.
with open(output_file_path, 'w', encoding='utf-8') as jsonl_handle:
    for record in formatted_dataset:
        jsonl_handle.write(json.dumps({"text": record["text"]}) + '\n')

print(f"Formatted dataset saved to {output_file_path}")

In [None]:
# Attach LoRA adapters to the loaded model
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

lora_settings = dict(
    r=16,                                  # adapter rank; kept low to limit capacity and overfitting
    target_modules=lora_target_modules,
    lora_alpha=8,                          # low alpha, also chosen against overfitting
    lora_dropout=0.1,                      # small dropout for regularization
    bias="none",
    use_gradient_checkpointing="unsloth",  # useful when long context lengths are needed
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

model = FastLanguageModel.get_peft_model(model, **lora_settings)

# Train and save the model
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Mixed precision: bf16 where supported, fp16 otherwise
use_bf16 = is_bfloat16_supported()

# Hyperparameters, tuned for a small dataset
training_args = TrainingArguments(
    per_device_train_batch_size=1,   # small per-step batch for a small dataset
    gradient_accumulation_steps=8,   # accumulate to keep a larger effective batch size
    num_train_epochs=2,              # extra passes to make the most of limited data
    warmup_ratio=0.1,                # short warmup to avoid large initial steps
    learning_rate=5e-5,              # low learning rate against overfitting
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.0,                # zero, to avoid over-penalizing limited data
    lr_scheduler_type="cosine",      # smooth learning-rate decay
    seed=3407,
    output_dir="outputs",
    report_to="none",
)

# Supervised fine-tuning on the "text" column produced by the formatting step
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_dataset,
    dataset_text_field="text",
    max_seq_length=max_sequence_length,
    dataset_num_proc=2,
    args=training_args,
)

# Run training and keep the returned statistics
trainer_stats = trainer.train()

In [None]:
# Download the Ollama install script and pipe it directly into `sh`.
# curl flags: -f fail on HTTP errors, -s silent, -S still show errors, -L follow redirects.
# NOTE(review): the script runs without being inspected, and no other visible cell
# uses Ollama — confirm this cell is still required.
!curl -fsSL https://ollama.com/install.sh | sh

In [None]:
%cd llama.cpp
!make

In [None]:
# Export the fine-tuned model to GGUF in four formats (q4_k_m, q5_k_m, q8_0, f16)
# and upload the results to the Hugging Face Hub repository named below.
# Uploading needs a token with write access. It is read from HF_TOKEN_WRITE, then
# HF_TOKEN, so a real token does not have to be pasted into the notebook; the
# "hf_token" placeholder is used only when neither variable is set.
model.push_to_hub_gguf(
    "hf_username/model_name",
    tokenizer=tokenizer,
    quantization_method=["q4_k_m", "q5_k_m", "q8_0", "f16"],
    token=os.environ.get("HF_TOKEN_WRITE") or os.environ.get("HF_TOKEN") or "hf_token",
)