# In[ ]:
'''
Created on Tue Feb 10 13:38:04 2026
# This document and program will show setup and imports for Conda Environment for model training using Unsloth. A separate program
will be used for setup, imports, and Retrieval Augmented Generation (RAG) of text messages.
## Unsloth installation guide: https://unsloth.ai/docs/get-started/install/windows-installation
### Version of libraries are in constant change. After Feb 10, 2026, this will likely get you close but changes may be necessary.
@author: David DiPaola
'''
#####################################################################################################################################
#
# 1) Use Anaconda Prompt
# 2a) Clean Cache: conda clean --all -y
# 2b) pip cache purge
# 3) Create environment: conda create -n unsloth_gpu python=3.12 -y
# 4) Activate environment: conda activate unsloth_gpu
# 5) Install Pytorch and related libraries: pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
# 6a) Verify Pytorch version: python -c "import torch; print(torch.__version__); print(torch.cuda.is_available())"
# 6b) Results: 2.10.0+cu13.0 and True
# 7a) Verify Torch CUDA Version and Availability: python -c "import torch; print(f'Torch CUDA Version: {torch.version.cuda}'); print(f'Is GPU Available: {torch.cuda.is_available()}')"
# 7b) Results: Torch CUDA Version: 13.0 and Is GPU Available: True
# 8) Verify CUDA version: nvidia-smi
# 9) Results: 13.1 (This is fine as a higher version works because backwards compatible; nvidia-smi reports the maximum CUDA API version your NVIDIA driver supports (forward compatibility indicator))
# 10a) Verify version of the NVIDIA CUDA Compiler currently active on your system: nvcc --version --> Results Compiler is 12.6 (still pointing to old version)
# 10b) Windows 10: Press the Windows key → type "edit the system environment variables"
# 10c) Edit and move to top (lower box System Variables): C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\libnvvp
# 10d) Edit and move to top (lower box System Variables): C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\bin\x64
# 10e) Edit and move to top (lower box System Variables): C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\bin
# 10f) Restart Anaconda Prompt
# 10g) Verify version of the NVIDIA CUDA Compiler currently active on your system: nvcc --version --> Results Compiler is 13.0
# 11) CUDA_PATH: Edit this variable to be C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0.
# 12) CUDA_PATH_V13.0: If this variable does not exist, create it. Set its value to C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0
# 13) pip install unsloth
# 14) conda install spyder -y
# 15) A separate file will be used for Llama RAG as llama-cpp-python often conflicts with Unsloth and numerous attempts to resolve conflicts unsuccessful.
# 16) Save Conda Environment (once everything is working): conda env export > C:\Your\fold\unsloth_gpu_backup.yml
#
###################################################################################################################################
# Recommended Import Order Per Unsloth: https://unsloth.ai/docs/get-started/install/windows-installation
from unsloth import FastLanguageModel, FastModel
import torch
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset, load_from_disk
# Imports for Message Training
import random
from transformers import TrainingArguments
'''
IMPORTANT: Please review both training sequences before you run this program and select one, commenting out the other.
'''
# Sanity-check the PyTorch install and GPU visibility before any training work.
for report_line in (
    f"Torch Version: {torch.__version__}",
    f"Torch CUDA Version: {torch.version.cuda}",
    f"CUDA Available: {torch.cuda.is_available()}",
):
    print(report_line)
# '''
# Stock Unsloth installation smoke test (10 - 15 min runtime): download a small
# Gemma-3 model, attach LoRA adapters, and run 60 short training steps.
SEQ_LEN = 512  # context length used consistently throughout this section

# Public JSONL instruction dataset used by the Unsloth sample.
DATA_URL = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
train_ds = load_dataset("json", data_files={"train": DATA_URL}, split="train")

# Load the 270M-parameter Gemma-3 instruct model, quantized to 4-bit.
lm, tok = FastLanguageModel.from_pretrained(
    model_name="unsloth/gemma-3-270m-it",
    max_seq_length=SEQ_LEN,
    load_in_4bit=True,        # 4-bit quantization; False would mean 16-bit LoRA
    load_in_8bit=False,       # 8-bit quantization (off)
    load_in_16bit=False,      # 16-bit LoRA (off)
    full_finetuning=False,    # set True for full fine-tuning instead of LoRA
    trust_remote_code=False,  # enable only to support brand-new model code
    # token="hf_...",         # needed only for gated models
)

# Patch the model and attach fast LoRA adapter weights.
lm = FastLanguageModel.get_peft_model(
    lm,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",],
    lora_alpha=16,
    lora_dropout=0,  # any value is supported; 0 is the optimized path
    bias="none",     # any value is supported; "none" is the optimized path
    # "unsloth" checkpointing uses ~30% less VRAM and fits 2x larger batches
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    max_seq_length=SEQ_LEN,
    use_rslora=False,   # rank-stabilized LoRA is supported but off here
    loftq_config=None,  # LoftQ is supported but off here
)

# 60-step training run; the settings mirror the Unsloth sample exactly.
trainer = SFTTrainer(
    model=lm,
    train_dataset=train_ds,
    tokenizer=tok,
    args=SFTConfig(
        max_seq_length=SEQ_LEN,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=10,
        max_steps=60,
        logging_steps=1,
        output_dir="outputs",
        optim="adamw_8bit",
        seed=3407,
        dataset_num_proc=1,
    ),
)
trainer.train()
# '''
'''
Expected Results
🦥 Unsloth Zoo will now patch everything to make training faster!
unified_chip2.jsonl: 100%|██████████| 95.6M/95.6M [00:02<00:00, 35.4MB/s]
Generating train split: 210289 examples [00:00, 1223394.42 examples/s]
==((====))== Unsloth 2026.2.1: Fast Gemma3 patching. Transformers: 4.57.6.
\\ /| NVIDIA GeForce RTX 3070. Num GPUs = 1. Max memory: 8.0 GB. Platform: Windows.
O^O/ \_/ \ Torch: 2.10.0+cu130. CUDA: 8.6. CUDA Toolkit: 13.0. Triton: 3.6.0
\ / Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
"-____-" Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
model.safetensors: 100%|██████████| 393M/393M [00:10<00:00, 37.6MB/s]
generation_config.json: 100%|██████████| 233/233 [00:00, ?B/s]
tokenizer_config.json: 1.16MB [00:00, ?B/s]
tokenizer.model: 100%|██████████| 4.69M/4.69M [00:00<00:00, 33.3MB/s]
tokenizer.json: 100%|██████████| 33.4M/33.4M [00:00<00:00, 42.1MB/s]
added_tokens.json: 100%|██████████| 35.0/35.0 [00:00, ?B/s]
special_tokens_map.json: 100%|██████████| 670/670 [00:00, ?B/s]
chat_template.jinja: 1.65kB [00:00, ?B/s]
Unsloth: Making `model.base_model.model.model` require gradients
Unsloth: Tokenizing ["text"]: 100%|██████████| 210289/210289 [00:37<00:00, 5558.03 examples/s]
🦥 Unsloth: Padding-free auto-enabled, enabling faster training.
==((====))== Unsloth - 2x faster free finetuning | Num GPUs used = 1
\\ /| Num examples = 210,289 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \ Batch size per device = 2 | Gradient accumulation steps = 4
\ / Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
"-____-" Trainable parameters = 3,796,992 of 271,895,168 (1.40% trained)
Completed in 10 - 15 minutes
'''
# This is the training I performed on 10,000 text messages to fine-tune the Llama 3.2 3B LLM model for semantic search and query response / reasoning
# '''
if __name__ == '__main__':
    # Fine-tune a locally stored Llama 3.2 3B with LoRA on ~10,000 personal
    # text messages. Hyperparameters below were tuned for an 8 GB VRAM GPU.

    # Raw-text message dataset (10,000 messages) previously saved to disk.
    # Load your own messages following the procedure outlined in another file.
    small_dataset = load_from_disk("C:/Your/Path/messages_10k_family_priority")
    model_path_local = "C:/Your/Path/llama3p2"

    # Load the base model from local files only (no Hub download), 4-bit quantized.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_path_local,
        dtype = None,             # None lets Unsloth pick the dtype automatically
        load_in_4bit = True,
        local_files_only = True,
    )

    # Attach LoRA adapters to the attention and MLP projection layers.
    model = FastLanguageModel.get_peft_model(
        model,
        r = 32,  # LoRA rank; reduced from 64 previously to lower VRAM use
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha = 16,
        lora_dropout = 0,
        bias = "none",
        use_gradient_checkpointing = "unsloth",
        random_state=42,
    )

    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = small_dataset,
        dataset_text_field = "full_message_text",  # dataset column holding the training text
        max_seq_length = 512,  # was 2048; shortened to fit 8 GB VRAM
        args = TrainingArguments(
            per_device_train_batch_size = 2,  # was 8
            gradient_accumulation_steps = 8,  # was 2; keeps effective batch at 2 x 8 = 16
            warmup_steps = 10,
            num_train_epochs = 3,  # ~3 epochs worked well for this ~10k-message set
            learning_rate = 2e-4,
            fp16 = False,
            bf16 = True,  # NOTE(review): assumes a bf16-capable GPU — confirm on other hardware
            logging_steps = 10,
            output_dir = "llama3.2-3b-family-10k",
            optim = "paged_adamw_8bit",  # alternative: "adamw_8bit"
            seed = 42,
            gradient_checkpointing=True,  # already enabled via Unsloth above; harmless to reinforce
            dataloader_num_workers=0,  # changed from 2 to 0: data loading is light, avoids worker spawn cost
            # These three settings help on low-VRAM (8 GB) cards:
            torch_compile=False,  # compilation sometimes hurts on 8 GB
            report_to="none",  # ["tqdm"], # <-- Change this to redirect logs to tqdm text output
            disable_tqdm=False,
        ),
    )

    # Optional but helps a lot on low VRAM:
    model.gradient_checkpointing_enable()  # Unsloth already does this, but it is safe to repeat
    torch.cuda.empty_cache()  # release cached allocator blocks before training starts
    trainer.train()  # completed in ~2.5 hours on the author's hardware
# '''