
Commit 3720614

added some extra logs to help folks understand the flow
1 parent cb1b487 · commit 3720614

9 files changed: +384 −115 lines

BedrockTextToSql_for_Athena.ipynb

Lines changed: 374 additions & 106 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 3 additions & 1 deletion
@@ -30,13 +30,15 @@ This post will address those challenges. First, we will include the meta-data of
 6. Create a glue database "imdb_stg". Create a glue crawler, set its database name to "imdb_stg", and start it to crawl the S3 bucket KB-<ACCOUNT_ID>/input location. It should create 2 tables in the Glue catalog.
 If you use another database name instead of "imdb_stg", update the "database_name" field in the file "imdb_schema.jsonl" to the exact name of the new glue database.
 7. Query the 2 tables via Athena to verify that the data exists.
+
 8. Create another folder "/metadata" in the S3 bucket KB-<ACCOUNT_ID>.
 - Upload the file "imdb_schema.jsonl" into the metadata folder.
 9. From the Bedrock console,
 - Create a datasource with name = 'knowledge-base-movie-details-data-source', type = 'Amazon S3', pointing to the S3 folder created in step #8. Retain the 'Default chunking and parsing configuration'.
 - Sync the 'knowledge-base-movie-details-data-source'.
 Anytime new database changes are applied, don't forget to upload the revised "imdb_schema.jsonl" file to the S3 folder created in step #8 and do a sync.
 10. Run the jupyter notebook with the following caveats:
+- In the file athena_execution.py, replace 'ATHENA-OUTPUT-BUCKET' with the name of the bucket where Athena has actual write permissions.
 - In step 2 of this walkthrough, if the values for the index name, vector field, or metadata field are different, substitute the new values in step "4.1 Update the variables" of the jupyter notebook.
 - If you are running the jupyter notebook using [Amazon Sagemaker - option 1](https://studiolab.sagemaker.aws/) or [Amazon Sagemaker - option 2](https://docs.aws.amazon.com/sagemaker/latest/dg/ex1-prepare.html) or VSCode, ensure the role or the user has the right set of permissions.
 11. Continue with the rest of the steps till Step 6. At this stage, the process is ready to receive the query in natural language.
@@ -48,7 +50,7 @@ This post will address those challenges. First, we will include the meta-data of
 17. [Correction loop, if applicable] The new prompt now adds Athena's response.
 18. [Correction loop, if applicable] Create the corrected SQL and continue the process. This iteration can be performed multiple times.
 19. Finally, execute the SQL using Athena and generate the output. Here, the output is presented to the user. For the sake of architectural simplicity, we did not show this step.
-Since the # of records in the movie file are large and there is no athena partitioning , the queries can take upto 2 mins to execute. This can be optimized in many ways and its not described here.
+Since the # of records in the title file is > 10M and there is no Athena partitioning, the queries can take up to 1-2 minutes to execute. This can be optimized in many ways; the optimizations are not described here.

 ## Using the repo
 Please start with [the notebook](https://github.com/aws-samples/text-to-sql-for-athena/blob/main/BedrockTextToSql_for_Athena.ipynb)
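
As a quick sanity check for steps 6 and 7 in the README hunk above, here is a minimal sketch that lists the tables the Glue crawler should have created. It assumes boto3 with configured AWS credentials, the default "imdb_stg" database name, and the us-east-1 region; adjust if your setup differs.

```python
# Minimal sketch: confirm the Glue crawler created the expected tables.
# Assumes boto3 is installed, AWS credentials are configured, and the
# database is named 'imdb_stg' as in the README (adjust if you renamed it).
import boto3

glue = boto3.client("glue", region_name="us-east-1")

tables = glue.get_tables(DatabaseName="imdb_stg")
for table in tables["TableList"]:
    # Expect the 2 tables crawled from the KB-<ACCOUNT_ID>/input location.
    print(table["Name"])
```

If the two tables appear here, the Athena query in step 7 should succeed against the same database.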
4 binary files changed (4.09 KB, 2.64 KB, 1.24 KB, and one without a reported size); binary contents are not shown.

athena_execution.py

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@


 class AthenaQueryExecute:
     def __init__(self):
-        self.glue_databucket_name='vishal-bucket103'
+        self.glue_databucket_name='ATHENA-OUTPUT-BUCKET'
         self.athena_client=Clientmodules.createAthenaClient()
         self.s3_client=Clientmodules.createS3Client()

@@ -47,7 +47,7 @@ def execute_query(self, query_string):
         return df

     def syntax_checker(self,query_string):
-        print("Inside yntax_checker", query_string)
+        print("Inside syntax_checker", query_string)
         query_result_folder='athena_query_output/'
         query_config = {"OutputLocation": f"s3://{self.glue_databucket_name}/{query_result_folder}"}
         query_execution_context = {
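
The hunk above cuts off inside syntax_checker. One plausible shape for the rest of the routine, sketched with plain boto3, is to run the candidate SQL through Athena's EXPLAIN and poll for the verdict. This is a hedged sketch, not the repo's exact code: the bucket name, database, and polling loop are assumptions.

```python
# Hedged sketch of an Athena-backed syntax check. Names mirror the snippet
# above but are illustrative assumptions, not the repo's actual values.
import time
import boto3

athena = boto3.client("athena", region_name="us-east-1")
query_config = {"OutputLocation": "s3://ATHENA-OUTPUT-BUCKET/athena_query_output/"}
query_execution_context = {"Catalog": "AwsDataCatalog", "Database": "imdb_stg"}

def check_syntax(query_string: str) -> str:
    """Return 'Passed' if Athena accepts the SQL, else Athena's error message."""
    # EXPLAIN computes only the query plan, so bad SQL fails fast and cheaply.
    response = athena.start_query_execution(
        QueryString="EXPLAIN " + query_string,
        QueryExecutionContext=query_execution_context,
        ResultConfiguration=query_config,
    )
    execution_id = response["QueryExecutionId"]
    while True:
        status = athena.get_query_execution(QueryExecutionId=execution_id)
        state = status["QueryExecution"]["Status"]["State"]
        if state in ("SUCCEEDED", "FAILED", "CANCELLED"):
            break
        time.sleep(1)
    if state == "SUCCEEDED":
        return "Passed"
    # The failure reason is what the correction loop feeds back into the prompt.
    return status["QueryExecution"]["Status"].get("StateChangeReason", state)
```

A failure message returned this way is exactly the kind of Athena response that steps 17-18 of the README feed back into the correction-loop prompt.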

llm_basemodel.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ def __init__(self,client):
         # Anthropic Claude
         # Bedrock LLM
         inference_modifier = {
-            "max_tokens_to_sample": 3000,
+            ### "max_tokens_to_sample": 3000,
             "temperature": 0,
             "top_k": 20,
             "top_p": 1,

openSearchVCEmbedding.py

Lines changed: 4 additions & 5 deletions
@@ -186,23 +186,22 @@ def get_data(self,metadata):
 def main():
     print('main() executed')
     index_name1 = 'bedrock-knowledge-base-default-index'
-    ##index_name1 = 'bedrock-knowledge-base-zttfoy'
-    domain = 'https://wi3kkhxignse60pcjop5.us-east-1.aoss.amazonaws.com'
+    domain = 'https://SAMPLE.us-east-1.aoss.amazonaws.com'
     vector_field = 'bedrock-knowledge-base-default-vector'
     fieldname = 'id'
     try:
         ebropen = EmbeddingBedrockOpenSearch(domain, vector_field, fieldname)
         ebropen.check_if_index_exists(index_name=index_name1, region='us-east-1', host=domain, http_auth=awsauth)
-        logger.info("now trying getdocument*************")
+
         vcindxdoc = ebropen.getDocumentfromIndex(index_name=index_name1)
-        logger.info("now getting the title**************")
+
         user_query = 'show me all the titles in US region'
         document = ebropen.getSimilaritySearch(user_query, vcindex=vcindxdoc)
         ##print(document)

         #result = ebropen.format_metadata(document)
         result = ebropen.get_data(document)
-        print("\n\n****************888888888************")
+
         print(result)
     except Exception as e:
         print(e)
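
For readers tracing main(), here is a minimal sketch of what a similarity search like getSimilaritySearch() could boil down to: embed the user query with a Bedrock embeddings model, then run a k-NN query against the OpenSearch Serverless index. The endpoint, index, and vector field names mirror main() above; the Titan model id and the use of opensearch-py and requests-aws4auth are assumptions, and the repo's actual implementation may differ.

```python
# Hedged sketch of a k-NN similarity search against the knowledge base index.
# Endpoint/index/field names mirror main(); the embeddings model id is an
# assumption, not the repo's confirmed choice.
import json
import boto3
from opensearchpy import OpenSearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth

region = "us-east-1"
credentials = boto3.Session().get_credentials()
# 'aoss' is the service name for OpenSearch Serverless request signing.
awsauth = AWS4Auth(credentials.access_key, credentials.secret_key,
                   region, "aoss", session_token=credentials.token)

client = OpenSearch(
    hosts=[{"host": "SAMPLE.us-east-1.aoss.amazonaws.com", "port": 443}],
    http_auth=awsauth, use_ssl=True, verify_certs=True,
    connection_class=RequestsHttpConnection,
)

# Embed the query text with Bedrock (Titan model id is an assumption).
bedrock = boto3.client("bedrock-runtime", region_name=region)
resp = bedrock.invoke_model(
    modelId="amazon.titan-embed-text-v1",
    body=json.dumps({"inputText": "show me all the titles in US region"}),
)
embedding = json.loads(resp["body"].read())["embedding"]

# k-NN search on the knowledge base's default vector field.
results = client.search(index="bedrock-knowledge-base-default-index", body={
    "size": 3,
    "query": {"knn": {"bedrock-knowledge-base-default-vector":
                      {"vector": embedding, "k": 3}}},
})
for hit in results["hits"]["hits"]:
    print(hit["_score"], hit["_source"].keys())
```

The hits' metadata (the schema JSONL chunks synced in step 9 of the README) is what get_data() extracts to build the SQL-generation prompt.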
