Commit ca96688

Fit model footprint in CPU if GPU memory is small
1 parent d549976 commit ca96688

File tree: 3 files changed (+108 -75 lines)

src/train/server/src/app.py

Lines changed: 9 additions & 9 deletions
@@ -41,23 +41,23 @@ def completion():
             return jsonify({"error": "Missing required params"}), 400

         # Get the required attributes from the request body
-        model_name = request.json["model"]
+        model_name = request.json["model_name"]
         training_data = request.json["training_data"]
         hf_token = request.json["hf_token"]
         deploy_to_hugging_face = request.json["deploy_to_hugging_face"]
         model_path = request.json["model_path"]

-        dataset_path = ""  # TODO: Make CSV from json received in training data
-        # Save that CSV locally
+        print(training_data, hf_token, deploy_to_hugging_face, model_path)

-        llm_train = LLMTrain(model_name, dataset_path)
-        # Call make completion which calls LiteLLM which calls Vertex AI
-        endpont = ""
-        if not endpont:
-            raise ValueError("ResponseUndefined")
+        llm_train = LLMTrain(model_name, training_data)
+        llm_train.run_train(model_name, training_data, deploy_to_hugging_face, model_path)
+
+        # endpont = ""
+        # if not endpont:
+        #     raise ValueError("ResponseUndefined")

         # Return response
-        return jsonify({"response": response,
+        return jsonify({"response": "",
                         "success": True}), 200
     except Exception as e:
         app.logger.error(str(e))
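For reference, a minimal sketch of a client request whose body matches the keys this handler now reads from request.json. The route path, port, model id, token, and file path below are placeholders, not taken from this commit:

    import requests

    # Hypothetical endpoint; the actual route and port come from the Flask app.
    url = "http://localhost:5000/completion"

    payload = {
        "model_name": "tiiuae/falcon-7b",  # placeholder model id
        "training_data": [
            {"input": "midjourney prompt for a sunset",
             "output": "golden hour over the sea, warm tones"}
        ],
        "hf_token": "<hf_token>",          # placeholder token
        "deploy_to_hugging_face": False,
        "model_path": "models/finetuned",  # placeholder path
    }

    resp = requests.post(url, json=payload)
    print(resp.status_code, resp.json())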
Lines changed: 34 additions & 39 deletions
@@ -1,60 +1,55 @@
+accelerate @ git+https://github.com/huggingface/accelerate.git@c9fbb71e37e7f64f5df54b39270bdabe82f1b893
 aiohttp==3.8.6
 aiosignal==1.3.1
-appdirs==1.4.4
 async-timeout==4.0.3
 attrs==23.1.0
-blinker==1.6.2
-cachetools==5.3.1
+bitsandbytes==0.39.0
+blinker==1.7.0
 certifi==2023.7.22
-charset-normalizer==3.3.0
-click==8.1.3
-filelock==3.12.4
-Flask==2.3.2
-Flask-Cors==3.0.10
+charset-normalizer==3.3.2
+click==8.1.7
+datasets==2.12.0
+dill==0.3.6
+einops==0.6.1
+filelock==3.13.1
+Flask==3.0.0
 frozenlist==1.4.0
-fsspec==2023.9.2
-google-api-core==2.12.0
-google-auth==2.23.3
-google-cloud-aiplatform==1.35.0
-google-cloud-bigquery==3.12.0
-google-cloud-core==2.3.3
-google-cloud-resource-manager==1.10.4
-google-cloud-storage==2.12.0
-google-crc32c==1.5.0
-google-resumable-media==2.6.0
-googleapis-common-protos==1.61.0
-grpc-google-iam-v1==0.12.6
-grpcio==1.59.0
-grpcio-status==1.59.0
-huggingface-hub==0.17.3
+fsspec==2023.10.0
+gunicorn==21.2.0
+huggingface-hub==0.19.0
 idna==3.4
 importlib-metadata==6.8.0
 itsdangerous==2.1.2
 Jinja2==3.1.2
-jsonify==0.5
-litellm==0.8.4
-MarkupSafe==2.1.2
+loralib==0.1.1
+MarkupSafe==2.1.3
+mpmath==1.3.0
 multidict==6.0.4
+multiprocess==0.70.14
+networkx==3.1
 numpy==1.24.4
-openai==0.28.1
 packaging==23.2
-proto-plus==1.22.3
-protobuf==4.24.4
-pyasn1==0.5.0
-pyasn1-modules==0.3.0
+pandas==2.0.3
+peft @ git+https://github.com/huggingface/peft.git@42a184f7423fc0bbc102a085851a8fb6e40132ad
+psutil==5.9.6
+pyarrow==14.0.1
 python-dateutil==2.8.2
-python-dotenv==1.0.0
+pytz==2023.3.post1
 PyYAML==6.0.1
 regex==2023.10.3
 requests==2.31.0
-rsa==4.9
-shapely==2.0.2
+responses==0.18.0
+safetensors==0.4.0
 six==1.16.0
-tiktoken==0.5.1
-tokenizers==0.14.1
+sympy==1.12
+tokenizers==0.13.3
+torch==2.0.1
 tqdm==4.66.1
+transformers @ git+https://github.com/huggingface/transformers.git@e03a9cc0cd7623a8d5208d7a4206f628b2bd5513
 typing_extensions==4.8.0
-urllib3==2.0.6
-Werkzeug==2.3.4
+tzdata==2023.3
+urllib3==2.0.7
+Werkzeug==3.0.1
+xxhash==3.4.1
 yarl==1.9.2
-zipp==3.15.0
+zipp==3.17.0

src/train/server/src/train.py

Lines changed: 65 additions & 27 deletions
@@ -6,7 +6,10 @@
 import torch
 import torch.nn as nn
 import transformers
-from datasets import load_dataset
+from datasets import (
+    load_dataset,
+    Dataset
+)
 from peft import (
     LoraConfig,
     PeftConfig,
@@ -27,30 +30,40 @@
 Train LLMs
 """

+
 class LLMTrain:
     # Initialize the class with model and data path
-    def __init__(self, MODEL_NAME, dataset_path) -> None:
+    def __init__(self, MODEL_NAME, training_data) -> None:
         self.MODEL_NAME = MODEL_NAME
-        self.dataset_path = dataset_path
+        self.training_data = training_data

     # Method to create transformer model and tokenizer
     def create_model_and_tokenizer(self):
-        # Define Quantization configuration to optimize model
+        # Define Quantization configuration to optimize model
         bnb_config = BitsAndBytesConfig(
             load_in_4bit=True,
             bnb_4bit_use_double_quant=True,
             bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.bfloat16
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            load_in_8bit_fp32_cpu_offload=True  # Set offloading to CPU.
         )
+        # Create a device map
+        device_map = {
+            0: ["transformer.h.0.", "transformer.h.1."],
+            1: ["transformer.h.2.", "transformer.h.3."],
+            -1: ["transformer.h.4.", "transformer.h.5.", "transformer.h.6.", "transformer.h.7."]
+        }
         # Create Transformer model based on given model name
         model = AutoModelForCausalLM.from_pretrained(
             self.MODEL_NAME,
-            device_map="auto",
+            device_map=device_map,  # Pass a custom device map
             trust_remote_code=True,
             quantization_config=bnb_config
         )
         # Create a tokenizer for the designated model
         tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
+        tokenizer.pad_token = tokenizer.eos_token
+        self.tokenizer = tokenizer
         return model, tokenizer

     # Method to prepare and configure the model for training
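The commit message targets spilling part of the model onto the CPU when GPU memory is small. As a point of comparison, here is a minimal sketch of how transformers/accelerate commonly express that: quantize with bitsandbytes, allow fp32 CPU offload, and give device_map="auto" a per-device memory budget. The model id and memory figures are placeholders, not taken from this repo:

    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    MODEL_NAME = "tiiuae/falcon-7b"  # placeholder model id

    # 4-bit quantization; modules that end up on the CPU are allowed to stay in fp32.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        llm_int8_enable_fp32_cpu_offload=True,
    )

    # Let accelerate split the model: GPU 0 gets a capped budget, the rest spills to CPU RAM.
    # An explicit map of the form {"transformer.h.4": "cpu", ...} also works; module names
    # depend on the architecture.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        max_memory={0: "10GiB", "cpu": "48GiB"},  # placeholder budgets
        trust_remote_code=True,
    )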
@@ -66,32 +79,51 @@ def prepare_and_configure_model(self, model):
             bias="none",
             task_type="CAUSAL_LM"
         )
-        model = get_peft_model(model, config) # Apply the defined configuration to the model
+        # Apply the defined configuration to the model
+        model = get_peft_model(model, config)
         self.print_trainable_parameters(model)
         return model

     # Method to generate result based on user provided prompt
     def generate_future_with_prompt(self, model, tokenizer, prompt):
         generation_config = model.generation_config
         device = "cuda:0"
-        # Encoding the prompt using tokenizer
+        # Encoding the prompt using tokenizer
         encoding = tokenizer(prompt, return_tensors="pt").to(device)
         with torch.inference_mode():
             outputs = model.generate(
-                input_ids = encoding.input_ids,
-                attention_mask = encoding.attention_mask,
-                generation_config = generation_config
+                input_ids=encoding.input_ids,
+                attention_mask=encoding.attention_mask,
+                generation_config=generation_config
             )
         print(tokenizer.decode(outputs[0], skip_special_tokens=True))

-    # Method to load and tokenize the dataset
-    def load_and_tokenize_data(self, tokenizer):
-        data = load_dataset("csv", data_files=self.dataset_path)
-        data = data["train"].shuffle().map(self.generate_and_tokenize_prompt)
-        return data
+    """
+    Method to load and tokenize the dataset.
+    It expects an array of objects, each object of the format:
+    {
+        'input': '{{user_input}}',
+        'output': '{{model_output}}'
+    }
+    """
+
+    def load_training_data(self, data):
+        # Convert array of objects to dictionary format
+        data_dict = {
+            'input': [obj['input'] for obj in data],
+            'output': [obj['output'] for obj in data]
+        }
+        d = Dataset.from_dict(data_dict)
+        d = d.shuffle().map(
+            self.generate_and_tokenize_prompt,
+            batched=True,
+            remove_columns=["input", "output"],
+            load_from_cache_file=False
+        )
+        return d

     # Method to fine tune the model
-    def fine_tune_model(self, model, data, tokenizer):
+    def fine_tune_model(self, model, data, tokenizer):
         training_args = transformers.TrainingArguments(
             per_device_train_batch_size=1,
             gradient_accumulation_steps=4,
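Per the docstring above, load_training_data now takes the request's training_data list directly instead of a CSV path. A small sketch of data in the expected shape (the output strings are made-up placeholders; the first input string is the sample prompt used elsewhere in this commit):

    training_data = [
        {"input": "midjourney prompt for a girl sit on the mountain",
         "output": "a girl sitting on a mountain ridge at sunrise, cinematic lighting"},
        {"input": "midjourney prompt for a rainy city street",
         "output": "neon-lit street in the rain, reflections on wet asphalt, 35mm"},
    ]

    # Inside the class this becomes a datasets.Dataset, then is shuffled and tokenized in batches:
    # data = self.load_training_data(training_data)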
@@ -109,24 +141,29 @@ def fine_tune_model(self, model, data, tokenizer):
             model=model,
             train_dataset=data,
             args=training_args,
-            data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
+            data_collator=transformers.DataCollatorForLanguageModeling(
+                tokenizer, mlm=False)
         )
         return trainer

     # Run a complete training cycle
-    def run_train(self, MODEL_NAME, dataset_path, deploy_to_hf, model_path):
+    def run_train(self, MODEL_NAME, training_data, deploy_to_hf, model_path):
         self.MODEL_NAME = MODEL_NAME
-        self.dataset_path = dataset_path
+        print("create_model_and_tokenizer")
         model, tokenizer = self.create_model_and_tokenizer()
+        print("prepare_and_configure_model")
         model = self.prepare_and_configure_model(model)

         prompt = """
 <human>: midjourney prompt for a girl sit on the mountain
 <assistant>:
 """.strip()
+        print("generating future with prompt")
         self.generate_future_with_prompt(model, tokenizer, prompt)
+        print("loading training data")
+        data = self.load_training_data(training_data)
+        print("\n\ndata:\n\n", data)

-        data = self.load_and_tokenize_data(tokenizer)
         trainer = self.fine_tune_model(model, data, tokenizer)
         trainer.train()

@@ -143,15 +180,16 @@ def deploy_to_hugging_face(self, model, model_path):
     # Generate dialog prompt with human and assistant tags
     def generate_prompt(self, data_point):
         return f"""
-<human>: {data_point["User"]}
-<assistant>: {data_point["Prompt"]}
+<human>: {data_point["input"]}
+<assistant>: {data_point["output"]}
 """.strip()

     # Tokenize the generated dialog prompt
     def generate_and_tokenize_prompt(self, data_point):
         full_prompt = self.generate_prompt(data_point)
         # padding and truncation are set to True for handling sequences of different length.
-        tokenized_full_prompt = self.tokenizer(full_prompt, padding=True, truncation=True)
+        tokenized_full_prompt = self.tokenizer(
+            full_prompt, padding=True, truncation=True)
         return tokenized_full_prompt

     # Print the number of parameters that are trainable in the model
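With the dataset keys renamed to input/output, generate_prompt turns a single record into a two-turn dialog string. A quick sketch with placeholder text:

    data_point = {"input": "midjourney prompt for a rainy city street",
                  "output": "neon-lit street in the rain, reflections on wet asphalt"}

    # generate_prompt(data_point) returns (after .strip()):
    # <human>: midjourney prompt for a rainy city street
    # <assistant>: neon-lit street in the rain, reflections on wet asphalt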
@@ -163,9 +201,9 @@ def print_trainable_parameters(self, model):
         all_param = 0

         for _, param in model.named_parameters():
-            all_param += param.numel() # Total parameters
+            all_param += param.numel()  # Total parameters
             if param.requires_grad:
-                trainable_params += param.numel() # Trainable parameters
+                trainable_params += param.numel()  # Trainable parameters
         print(
             f"trainable params: {trainable_params} || all params: {all_param} || trainables%: {100 * trainable_params / all_param}"
-        )
+        )
