# In[ ]:
'''
Created on Tue Feb 10 13:38:04 2026
# This document and program will show setup and imports for Conda Environment for model training using Unsloth. A separate program
will be used for setup, imports, and Retrieval Augmented Generation (RAG) of text messages.
## Unsloth installation guide: https://unsloth.ai/docs/get-started/install/windows-installation
### Version of libraries are in constant change. After Feb 10, 2026, this will likely get you close but changes may be necessary.
@author: David DiPaola
'''
#####################################################################################################################################
#
# 1) Use Anaconda Prompt
# 2a) Clean Cache: conda clean --all -y
# 2b) pip cache purge
# 3) Create environment: conda create -n unsloth_gpu python=3.12 -y
# 4) Activate environment: conda activate unsloth_gpu
# 5) Install Pytorch and related libraries: pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
# 6a) Verify Pytorch version: python -c "import torch; print(torch.__version__); print(torch.cuda.is_available())"
# 6b) Results: 2.10.0+cu13.0 and True
# 7a) Verify Torch CUDA Version and Availability: python -c "import torch; print(f'Torch CUDA Version: {torch.version.cuda}'); print(f'Is GPU Available: {torch.cuda.is_available()}')"
# 7b) Results: Torch CUDA Version: 13.0 and Is GPU Available: True
# 8) Verify CUDA version: nvidia-smi
# 9) Results: 13.1 (This is fine as a higher version works because backwards compatible; nvidia-smi reports the maximum CUDA API version your NVIDIA driver supports (forward compatibility indicator))
# 10a) Verify version of the NVIDIA CUDA Compiler currently active on your system: nvcc --version --> Results Compiler is 12.6 (still pointing to old version)
# 10b) Windows 10: Press the Windows key → type "edit the system environment variables"
# 10c) Edit and move to top (lower box System Variables): C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\libnvvp
# 10d) Edit and move to top (lower box System Variables): C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\bin\x64
# 10e) Edit and move to top (lower box System Variables): C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\bin
# 10f) Restart Anaconda Prompt
# 10g) Verify version of the NVIDIA CUDA Compiler currently active on your system: nvcc --version --> Results Compiler is 13.0
# 11) CUDA_PATH: Edit this variable to be C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0.
# 12) CUDA_PATH_V13.0: If this variable does not exist, create it. Set its value to C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0
# 13) pip install unsloth
# 14) conda install spyder -y
# 15) A separate file will be used for Llama RAG as llama-cpp-python often conflicts with Unsloth and numerous attempts to resolve conflicts unsuccessful.
# 16) Save Conda Environment (once everything is working): conda env export > C:\Your\fold\unsloth_gpu_backup.yml
#
###################################################################################################################################
# Recommended Import Order Per Unsloth: https://unsloth.ai/docs/get-started/install/windows-installation
from unsloth import FastLanguageModel, FastModel
import torch
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset, load_from_disk
# Imports for Message Training
import random
from transformers import TrainingArguments
'''
IMPORTANT: Please review both training sequences before you run this program and select one, commenting out the other.
'''
# Sanity-check the PyTorch install and GPU visibility before any training work.
for report_line in (
    f"Torch Version: {torch.__version__}",
    f"Torch CUDA Version: {torch.version.cuda}",
    f"CUDA Available: {torch.cuda.is_available()}",
):
    print(report_line)
# '''
# Stock Unsloth installation smoke test (10 - 15 min runtime): download a small
# Gemma-3 model, attach LoRA adapters, and run 60 short training steps.
SEQ_LEN = 512  # context length used consistently throughout this section

# Public JSONL instruction dataset used by the Unsloth sample.
DATA_URL = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
train_ds = load_dataset("json", data_files={"train": DATA_URL}, split="train")

# Load the 270M-parameter Gemma-3 instruct model, quantized to 4-bit.
lm, tok = FastLanguageModel.from_pretrained(
    model_name="unsloth/gemma-3-270m-it",
    max_seq_length=SEQ_LEN,
    load_in_4bit=True,        # 4-bit quantization; False would mean 16-bit LoRA
    load_in_8bit=False,       # 8-bit quantization (off)
    load_in_16bit=False,      # 16-bit LoRA (off)
    full_finetuning=False,    # set True for full fine-tuning instead of LoRA
    trust_remote_code=False,  # enable only to support brand-new model code
    # token="hf_...",         # needed only for gated models
)

# Patch the model and attach fast LoRA adapter weights.
lm = FastLanguageModel.get_peft_model(
    lm,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",],
    lora_alpha=16,
    lora_dropout=0,  # any value is supported; 0 is the optimized path
    bias="none",     # any value is supported; "none" is the optimized path
    # "unsloth" checkpointing uses ~30% less VRAM and fits 2x larger batches
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    max_seq_length=SEQ_LEN,
    use_rslora=False,   # rank-stabilized LoRA is supported but off here
    loftq_config=None,  # LoftQ is supported but off here
)

# 60-step training run; the settings mirror the Unsloth sample exactly.
trainer = SFTTrainer(
    model=lm,
    train_dataset=train_ds,
    tokenizer=tok,
    args=SFTConfig(
        max_seq_length=SEQ_LEN,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=10,
        max_steps=60,
        logging_steps=1,
        output_dir="outputs",
        optim="adamw_8bit",
        seed=3407,
        dataset_num_proc=1,
    ),
)
trainer.train()
# '''
'''
Expected Results
🦥 Unsloth Zoo will now patch everything to make training faster!
unified_chip2.jsonl: 100%|██████████| 95.6M/95.6M [00:02<00:00, 35.4MB/s]
Generating train split: 210289 examples [00:00, 1223394.42 examples/s]
==((====))== Unsloth 2026.2.1: Fast Gemma3 patching. Transformers: 4.57.6.
\\ /| NVIDIA GeForce RTX 3070. Num GPUs = 1. Max memory: 8.0 GB. Platform: Windows.
O^O/ \_/ \ Torch: 2.10.0+cu130. CUDA: 8.6. CUDA Toolkit: 13.0. Triton: 3.6.0
\ / Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
"-____-" Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
model.safetensors: 100%|██████████| 393M/393M [00:10<00:00, 37.6MB/s]
generation_config.json: 100%|██████████| 233/233 [00:00, ?B/s]
tokenizer_config.json: 1.16MB [00:00, ?B/s]
tokenizer.model: 100%|██████████| 4.69M/4.69M [00:00<00:00, 33.3MB/s]
tokenizer.json: 100%|██████████| 33.4M/33.4M [00:00<00:00, 42.1MB/s]
added_tokens.json: 100%|██████████| 35.0/35.0 [00:00, ?B/s]
special_tokens_map.json: 100%|██████████| 670/670 [00:00, ?B/s]
chat_template.jinja: 1.65kB [00:00, ?B/s]
Unsloth: Making `model.base_model.model.model` require gradients
Unsloth: Tokenizing ["text"]: 100%|██████████| 210289/210289 [00:37<00:00, 5558.03 examples/s]
🦥 Unsloth: Padding-free auto-enabled, enabling faster training.
==((====))== Unsloth - 2x faster free finetuning | Num GPUs used = 1
\\ /| Num examples = 210,289 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \ Batch size per device = 2 | Gradient accumulation steps = 4
\ / Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
"-____-" Trainable parameters = 3,796,992 of 271,895,168 (1.40% trained)
Completed in 10 - 15 minutes
'''
# This is the training I performed on 10,000 text messages to fine-tune the Llama 3.2 3B LLM model for semantic search and query response / reasoning
# '''
if __name__ == '__main__':
    # Fine-tune a locally stored Llama 3.2 3B with LoRA on ~10,000 personal
    # text messages. Hyperparameters below were tuned for an 8 GB VRAM GPU.

    # Raw-text message dataset (10,000 messages) previously saved to disk.
    # Load your own messages following the procedure outlined in another file.
    small_dataset = load_from_disk("C:/Your/Path/messages_10k_family_priority")
    model_path_local = "C:/Your/Path/llama3p2"

    # Load the base model from local files only (no Hub download), 4-bit quantized.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_path_local,
        dtype = None,             # None lets Unsloth pick the dtype automatically
        load_in_4bit = True,
        local_files_only = True,
    )

    # Attach LoRA adapters to the attention and MLP projection layers.
    model = FastLanguageModel.get_peft_model(
        model,
        r = 32,  # LoRA rank; reduced from 64 previously to lower VRAM use
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha = 16,
        lora_dropout = 0,
        bias = "none",
        use_gradient_checkpointing = "unsloth",
        random_state=42,
    )

    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = small_dataset,
        dataset_text_field = "full_message_text",  # dataset column holding the training text
        max_seq_length = 512,  # was 2048; shortened to fit 8 GB VRAM
        args = TrainingArguments(
            per_device_train_batch_size = 2,  # was 8
            gradient_accumulation_steps = 8,  # was 2; keeps effective batch at 2 x 8 = 16
            warmup_steps = 10,
            num_train_epochs = 3,  # ~3 epochs worked well for this ~10k-message set
            learning_rate = 2e-4,
            fp16 = False,
            bf16 = True,  # NOTE(review): assumes a bf16-capable GPU — confirm on other hardware
            logging_steps = 10,
            output_dir = "llama3.2-3b-family-10k",
            optim = "paged_adamw_8bit",  # alternative: "adamw_8bit"
            seed = 42,
            gradient_checkpointing=True,  # already enabled via Unsloth above; harmless to reinforce
            dataloader_num_workers=0,  # changed from 2 to 0: data loading is light, avoids worker spawn cost
            # These three settings help on low-VRAM (8 GB) cards:
            torch_compile=False,  # compilation sometimes hurts on 8 GB
            report_to="none",  # ["tqdm"], # <-- Change this to redirect logs to tqdm text output
            disable_tqdm=False,
        ),
    )

    # Optional but helps a lot on low VRAM:
    model.gradient_checkpointing_enable()  # Unsloth already does this, but it is safe to repeat
    torch.cuda.empty_cache()  # release cached allocator blocks before training starts
    trainer.train()  # completed in ~2.5 hours on the author's hardware
# '''