 import torch
 import torch.nn as nn
 import transformers
-from datasets import load_dataset
+from datasets import (
+    load_dataset,
+    Dataset
+)
 from peft import (
     LoraConfig,
     PeftConfig,
 Train LLMs
 """

+
 class LLMTrain:
     # Initialize the class with model and data path
-    def __init__(self, MODEL_NAME, dataset_path) -> None:
+    def __init__(self, MODEL_NAME, training_data) -> None:
         self.MODEL_NAME = MODEL_NAME
-        self.dataset_path = dataset_path
+        self.training_data = training_data

     # Method to create transformer model and tokenizer
     def create_model_and_tokenizer(self):
-        # Define Quantization configuration to optimize model
+        # Define Quantization configuration to optimize model
         bnb_config = BitsAndBytesConfig(
             load_in_4bit=True,
             bnb_4bit_use_double_quant=True,
             bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.bfloat16
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            llm_int8_enable_fp32_cpu_offload=True  # Offload non-quantized modules to CPU.
         )
+        # Create a device map
+        device_map = {
+            0: ["transformer.h.0.", "transformer.h.1."],
+            1: ["transformer.h.2.", "transformer.h.3."],
+            -1: ["transformer.h.4.", "transformer.h.5.", "transformer.h.6.", "transformer.h.7."]
+        }
         # Create Transformer model based on given model name
         model = AutoModelForCausalLM.from_pretrained(
             self.MODEL_NAME,
-            device_map="auto",
+            device_map=device_map,  # Pass a custom device map
             trust_remote_code=True,
             quantization_config=bnb_config
         )
         # Create a tokenizer for the designated model
         tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
+        tokenizer.pad_token = tokenizer.eos_token
+        self.tokenizer = tokenizer
         return model, tokenizer

     # Method to prepare and configure the model for training
@@ -66,32 +79,51 @@ def prepare_and_configure_model(self, model):
             bias="none",
             task_type="CAUSAL_LM"
         )
-        model = get_peft_model(model, config)  # Apply the defined configuration to the model
+        # Apply the defined configuration to the model
+        model = get_peft_model(model, config)
         self.print_trainable_parameters(model)
         return model

     # Method to generate result based on user provided prompt
     def generate_future_with_prompt(self, model, tokenizer, prompt):
         generation_config = model.generation_config
         device = "cuda:0"
-        # Encoding the prompt using tokenizer
+        # Encoding the prompt using tokenizer
         encoding = tokenizer(prompt, return_tensors="pt").to(device)
         with torch.inference_mode():
             outputs = model.generate(
-                input_ids=encoding.input_ids,
-                attention_mask=encoding.attention_mask,
-                generation_config=generation_config
+                input_ids=encoding.input_ids,
+                attention_mask=encoding.attention_mask,
+                generation_config=generation_config
             )
         print(tokenizer.decode(outputs[0], skip_special_tokens=True))

-    # Method to load and tokenize the dataset
-    def load_and_tokenize_data(self, tokenizer):
-        data = load_dataset("csv", data_files=self.dataset_path)
-        data = data["train"].shuffle().map(self.generate_and_tokenize_prompt)
-        return data
+    """
+    Method to load and tokenize the dataset.
+    It expects an array of objects, each of the format:
+    {
+        'input': '{{user_input}}',
+        'output': '{{model_output}}'
+    }
+    """
+
+    def load_training_data(self, data):
+        # Convert array of objects to dictionary format
+        data_dict = {
+            'input': [obj['input'] for obj in data],
+            'output': [obj['output'] for obj in data]
+        }
+        d = Dataset.from_dict(data_dict)
+        # Map over single examples; generate_and_tokenize_prompt builds and tokenizes one prompt per record
+        d = d.shuffle().map(
+            self.generate_and_tokenize_prompt,
+            remove_columns=["input", "output"],
+            load_from_cache_file=False
+        )
+        return d

     # Method to fine tune the model
-    def fine_tune_model(self, model, data, tokenizer):
+    def fine_tune_model(self, model, data, tokenizer):
         training_args = transformers.TrainingArguments(
             per_device_train_batch_size=1,
             gradient_accumulation_steps=4,
@@ -109,24 +141,29 @@ def fine_tune_model(self, model, data, tokenizer):
             model=model,
             train_dataset=data,
             args=training_args,
-            data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
+            data_collator=transformers.DataCollatorForLanguageModeling(
+                tokenizer, mlm=False)
         )
         return trainer

     # Run a complete training cycle
-    def run_train(self, MODEL_NAME, dataset_path, deploy_to_hf, model_path):
+    def run_train(self, MODEL_NAME, training_data, deploy_to_hf, model_path):
         self.MODEL_NAME = MODEL_NAME
-        self.dataset_path = dataset_path
+        print("create_model_and_tokenizer")
         model, tokenizer = self.create_model_and_tokenizer()
+        print("prepare_and_configure_model")
         model = self.prepare_and_configure_model(model)

         prompt = """
 <human>: midjourney prompt for a girl sit on the mountain
 <assistant>:
         """.strip()
+        print("generating future with prompt")
         self.generate_future_with_prompt(model, tokenizer, prompt)
+        print("loading training data")
+        data = self.load_training_data(training_data)
+        print("\n\ndata:\n\n", data)

-        data = self.load_and_tokenize_data(tokenizer)
         trainer = self.fine_tune_model(model, data, tokenizer)
         trainer.train()

@@ -143,15 +180,16 @@ def deploy_to_hugging_face(self, model, model_path):
     # Generate dialog prompt with human and assistant tags
     def generate_prompt(self, data_point):
         return f"""
-<human>: {data_point["User"]}
-<assistant>: {data_point["Prompt"]}
+<human>: {data_point["input"]}
+<assistant>: {data_point["output"]}
 """.strip()

     # Tokenize the generated dialog prompt
     def generate_and_tokenize_prompt(self, data_point):
         full_prompt = self.generate_prompt(data_point)
         # padding and truncation are set to True for handling sequences of different length.
-        tokenized_full_prompt = self.tokenizer(full_prompt, padding=True, truncation=True)
+        tokenized_full_prompt = self.tokenizer(
+            full_prompt, padding=True, truncation=True)
         return tokenized_full_prompt

     # Print the number of parameters that are trainable in the model
@@ -163,9 +201,9 @@ def print_trainable_parameters(self, model):
         all_param = 0

         for _, param in model.named_parameters():
-            all_param += param.numel()  # Total parameters
+            all_param += param.numel()  # Total parameters
             if param.requires_grad:
-                trainable_params += param.numel()  # Trainable parameters
+                trainable_params += param.numel()  # Trainable parameters
         print(
             f"trainable params: {trainable_params} || all params: {all_param} || trainables%: {100 * trainable_params / all_param}"
-        )
+        )
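
For orientation, here is a minimal usage sketch of the class after this change. Only the LLMTrain class, the run_train signature, and the input/output record shape come from the diff above; the module name, model id, and sample records are placeholder assumptions.

# Hypothetical usage sketch; the module name "llm_train", the model id, and the
# sample records are assumptions, not part of the commit.
from llm_train import LLMTrain

# Training data in the documented format: a list of objects with
# 'input' (user text) and 'output' (expected model response).
training_data = [
    {"input": "midjourney prompt for a girl sitting on a mountain",
     "output": "a girl sitting on a mountain peak, golden hour, ultra detailed"},
    {"input": "midjourney prompt for a futuristic city",
     "output": "futuristic city skyline at night, neon lights, cinematic lighting"},
]

MODEL_NAME = "some-org/some-causal-lm"  # placeholder model id
llm = LLMTrain(MODEL_NAME, training_data)
llm.run_train(
    MODEL_NAME,
    training_data,
    deploy_to_hf=False,           # assumed flag: whether to push the result to the Hugging Face Hub
    model_path="./trained-model"  # path used when deploying the fine-tuned model
)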