aws-samples
diff --git a/‎.gitignore
Lines changed: 1 addition & 0 deletions b/‎.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/‎BedrockTextToSql_for_Athena.ipynb
Lines changed: 121 additions & 79 deletions b/‎BedrockTextToSql_for_Athena.ipynb
Lines changed: 121 additions & 79 deletions
@@ -0,0 +1 @@
+__pycache__/
@@ -81,15 +81,25 @@
     "**Prerequisite**\n",
     "\n",
     "The following prerequisites need to be accomplished before executing this notebook.\n",
-    "- A Sagemaker instance with a role having access to bedrock, glue,athena, s3,lakeformation\n",
+    "This notebook can be executed from a SageMaker instance or from a VS Code editor.\n",
+    "- Create a role with access to Bedrock, Glue, Athena, S3, and Lake Formation.\n",
+    "- Assign the role to the SageMaker instance or to the instance where the VS Code editor is running.\n",
     "- Glue Database and tables. Provided spark notebook to create.\n",
     "- An Amazon OpenSearch cluster for storing embeddings. Here the OpenSearch credentials are in the notebooks. However, the OpenSearch cluster's access credentials (username and password) can be stored in AWS Secrets Manager by following the steps described [here](https://docs.aws.amazon.com/secretsmanager/latest/userguide/managing-secrets.html).\n",
     "\n",
-    "**The overall workflow for this notebook is as follows:**\n",
-    "1. Download data from source https://developer.imdb.com/non-commercial-datasets/#titleakastsvgz and upload to S3.\n",
-    "1. Create database and load datasets in Glue. Make sure see of the you are able to query through athena. \n",
-    "1. Install the required Python packages (specifically boto version mentioned)\n",
-    "1. Create embedding and vector store.Do a similarity search with embeddings stored in the OpenSearch index for an input query.\n",
+    "**The  workflow for this notebook is as follows:**\n",
+    "1. Create an S3 bucket named `knowledgebase-<ACCOUNT_ID>`.\n",
+    "    - create a folder \"input\" in that bucket\n",
+    "2. Download data from source \n",
+    "    - https://developer.imdb.com/non-commercial-datasets/#titleakastsvgz and upload to S3 bucket from step 1 and into the \"input\" folder\n",
+    "    - https://developer.imdb.com/non-commercial-datasets/#titlebasicstsvgz and upload to S3 bucket from step 1 and into the \"input\" folder\n",
+    "3. Glue Steps\n",
+    "    - Create a glue database  \"imdb_stg\" \n",
+    "    - Create a glue crawler \"text-2-sql-crawler\" with the datasource set to the S3 bucket created in step 1.   Run the crawler.\n",
+    "    - Two tables should be created in the Glue Data Catalog. Make sure you are able to query them through Athena.\n",
+    "4. From the Bedrock console, create a new knowledge base.\n",
+    "5. Install the required Python packages.\n",
+    "6. Create embeddings and a vector store. Do a similarity search with embeddings stored in the OpenSearch index for an input query.\n",
     "7. Execute this notebook to generate SQL."
    ]
   },
@@ -108,7 +118,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "efc3af34-9c4b-4e95-9147-cf498a74a0c2",
    "metadata": {
     "pycharm": {
@@ -118,8 +128,16 @@
    },
    "outputs": [],
    "source": [
-    "# !pip3 install boto3==1.34.8\n",
-    "# !pip3 install jq"
+    "!pip3 install boto3\n",
+    "!pip3 install jq\n",
+    "\n",
+    "!pip3 install langchain\n",
+    "!pip3 install langchain-community langchain-core\n",
+    "!pip3 install pandas\n",
+    "!pip3 install opensearch-py\n",
+    "!pip3 install langchain-aws\n",
+    "!pip3 install requests-aws4auth\n",
+    "!pip3 install botocore"
    ]
   },
   {
@@ -148,13 +166,14 @@
    "source": [
     "import boto3\n",
     "from botocore.config import Config\n",
-    "from langchain.llms.bedrock import Bedrock\n",
-    "from langchain.embeddings import BedrockEmbeddings"
+    "from langchain_community.embeddings import BedrockEmbeddings\n",
+    "from langchain_aws import BedrockLLM\n",
+    "import traceback"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 4,
    "id": "ef96ac35-3597-4bd8-80ca-035c6c98050b",
    "metadata": {
     "pycharm": {
@@ -168,15 +187,14 @@
     "import json\n",
     "import os,sys\n",
     "import re\n",
-    "sys.path.append(\"/home/ec2-user/SageMaker/llm_bedrock_v0/\")\n",
     "import time\n",
     "import pandas as pd\n",
     "import io"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "id": "482a9055-6cc2-419c-839f-ca1326b04957",
    "metadata": {
     "pycharm": {
@@ -223,23 +241,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "id": "436fb4f1-cd34-4146-bc96-da5e3608d720",
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
     },
     "tags": []
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{'modelArn': 'arn:aws:bedrock:us-east-1::foundation-model/amazon.titan-tg1-large', 'modelId': 'amazon.titan-tg1-large', 'modelName': 'Titan Text Large', 'providerName': 'Amazon', 'inputModalities': ['TEXT'], 'outputModalities': ['TEXT'], 'responseStreamingSupported': True, 'customizationsSupported': [], 'inferenceTypesSupported': ['ON_DEMAND'], 'modelLifecycle': {'status': 'ACTIVE'}}\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "session = boto3.session.Session()\n",
     "bedrock_client = session.client('bedrock')\n",
@@ -260,31 +270,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "id": "72e44bd9-8787-447a-b5e0-0961547bafef",
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
     },
     "tags": []
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "athena client created \n",
-      "s3 client created !!\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "rqstath=AthenaQueryExecute()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "c46db72d",
+   "metadata": {},
+   "source": [
+    "### Step 4.1 Update the variables"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "id": "e6b0cedd-0fcb-4a3b-a7be-50bd25197230",
    "metadata": {
     "pycharm": {
@@ -293,7 +302,28 @@
    },
    "outputs": [],
    "source": [
-    "ebropen=EmbeddingBedrockOpenSearch()"
+    "\n",
+    "index_name = 'bedrock-knowledge-base-default-index'  \n",
+    "domain = 'https://OPENSEARCH.aoss.amazonaws.com' ##-- update here with your OpenSearch domain\n",
+    "region = 'us-east-1' ##-- update here with your AWS region\n",
+    "vector_name = 'bedrock-knowledge-base-default-vector'\n",
+    "fieldname = 'id'\n",
+    "   "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9af3a226",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ebropen2=EmbeddingBedrockOpenSearch(domain,  vector_name,  fieldname)\n",
+    "if ebropen2 is None:\n",
+    "    print(\"ebropen2 is null\")\n",
+    "else:\n",
+    "    attrs = vars(ebropen2)\n",
+    "    print(', '.join(\"%s: %s\" % item for item in attrs.items()))"
    ]
   },
   {
@@ -313,7 +343,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 11,
    "id": "fdcd87a9-f028-43ab-94a4-8d9b5b5bd163",
    "metadata": {
     "pycharm": {
@@ -324,15 +354,25 @@
    "outputs": [],
    "source": [
     "class RequestQueryBedrock:\n",
-    "    def __init__(self):\n",
-    "        # self.model_id = \"anthropic.claude-v2\"\n",
-    "        self.bedrock_client = Clientmodules.createBedrockRuntimeClient()\n",
+    "    def __init__(self, ebropen2):\n",
+    "    \n",
+    "        ##self.bedrock_client = Clientmodules.createBedrockRuntimeClient()\n",
+    "        self.ebropen2 = ebropen2\n",
+    "  \n",
+    "\n",
+    "        self.bedrock_client = ebropen2.bedrock_client\n",
+    "        if self.bedrock_client is None:\n",
+    "            self.bedrock_client = Clientmodules.createBedrockRuntimeClient()\n",
+    "        else : \n",
+    "            print(\"the bedrock_client is not null\")\n",
     "        self.language_model = LanguageModel(self.bedrock_client)\n",
     "        self.llm = self.language_model.llm\n",
-    "    def getOpenSearchEmbedding(self,index_name,user_query):\n",
-    "        vcindxdoc=ebropen.getDocumentfromIndex(index_name=index_name)\n",
-    "        documnet=ebropen.getSimilaritySearch(user_query,vcindxdoc)\n",
-    "        return ebropen.format_metadata(documnet)\n",
+    "        \n",
+    "    def getOpenSearchEmbedding(self, index_name,user_query):\n",
+    "        vcindxdoc=self.ebropen2.getDocumentfromIndex(index_name=index_name)\n",
+    "        documnet=self.ebropen2.getSimilaritySearch(user_query,vcindxdoc)\n",
+    "        #return self.ebropen2.format_metadata(documnet)\n",
+    "        return self.ebropen2.get_data(documnet)\n",
     "        \n",
     "    def generate_sql(self,prompt, max_attempt=4) ->str:\n",
     "            \"\"\"\n",
@@ -355,8 +395,9 @@
     "                    logger.info(f'we are in Try block to generate the sql and count is :{attempt+1}')\n",
     "                    generated_sql = self.llm.predict(prompt)\n",
     "                    query_str = generated_sql.split(\"```\")[1]\n",
-    "                    query_str = \" \".join(query_str.split(\"\\n\")).strip()\n",
+    "                    query_str = \" \".join(query_str.split(\"\\n\")).strip()                    \n",
     "                    sql_query = query_str[3:] if query_str.startswith(\"sql\") else query_str\n",
+    "                    print(sql_query)\n",
     "                    # return sql_query\n",
     "                    syntaxcheckmsg=rqstath.syntax_checker(sql_query)\n",
     "                    if syntaxcheckmsg=='Passed':\n",
@@ -374,6 +415,7 @@
     "                        prompts.append(prompt)\n",
     "                        attempt += 1\n",
     "                except Exception as e:\n",
+    "                    print(e)\n",
     "                    logger.error('FAILED')\n",
     "                    msg = str(e)\n",
     "                    error_messages.append(msg)\n",
@@ -382,32 +424,17 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "13da2f51-6032-4564-8943-3c36e55b025f",
-   "metadata": {
-    "pycharm": {
-     "name": "#%%\n"
-    },
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "bedrock runtime client created \n"
-     ]
-    }
-   ],
+   "cell_type": "markdown",
+   "id": "d5cc21a1",
+   "metadata": {},
    "source": [
-    "rqst=RequestQueryBedrock()"
+    "Create an instance of the `RequestQueryBedrock` class."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
-   "id": "552419cd-3827-41ee-9aab-ca93c756b9c9",
+   "execution_count": null,
+   "id": "13da2f51-6032-4564-8943-3c36e55b025f",
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -416,12 +443,12 @@
    },
    "outputs": [],
    "source": [
-    "index_name = 'llm_vector_db_metadata_indx2'"
+    "rqst=RequestQueryBedrock(ebropen2)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 13,
    "id": "db6b4e5f-b52e-4209-aab7-403dabc61239",
    "metadata": {
     "pycharm": {
@@ -434,10 +461,12 @@
     "def userinput(user_query):\n",
     "    logger.info(f'Searching metadata from vector store')\n",
     "    # vector_search_match=rqst.getEmbeddding(user_query)\n",
-    "    vector_search_match=rqst.getOpenSearchEmbedding(index_name,user_query)\n",
-    "    # print(vector_search_match)\n",
-    "    details=\"It is important that the SQL query complies with Athena syntax. During join if column name are same please use alias ex llm.customer_id in select statement. It is also important to respect the type of columns: if a column is string, the value should be enclosed in quotes. If you are writing CTEs then include all the required columns. While concatenating a non string column, make sure cast the column to string. For date columns comparing to string , please cast the string input.\"\n",
+    "    vector_search_match=rqst.getOpenSearchEmbedding( index_name,user_query)\n",
+    "  \n",
+    " \n",
+    "    details=\"It is important that the SQL query complies with Athena syntax. During join if column name are same please use alias ex llm.customer_id in select statement. It is also important to respect the type of columns: if a column is string, the value should be enclosed in quotes. If you are writing CTEs then include all the required columns. While concatenating a non string column, make sure cast the column to string. For date columns comparing to string , please cast the string input. Alwayws use the database name along with the table name\"\n",
     "    final_question = \"\\n\\nHuman:\"+details + vector_search_match + user_query+ \"n\\nAssistant:\"\n",
+    "    print(\"FINAL QUESTION :::\" + final_question)\n",
     "    answer = rqst.generate_sql(final_question)\n",
     "    return answer"
    ]
@@ -456,7 +485,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 24,
    "id": "21d5b62a-4446-43d2-a4bf-6d51061160a9",
    "metadata": {
     "pycharm": {
@@ -466,7 +495,11 @@
    },
    "outputs": [],
    "source": [
-    "user_query='show me all the titles in US region'"
+    "#user_query='how many titles exist '\n",
+    "#user_query = 'show me top 10 title by user rating'\n",
+    "#user_query = 'show me top 10 titles in US region'\n",
+    "#user_query = 'which year was a movie/title  made'\n",
+    "user_query = 'how many titles are from the US region'"
    ]
   },
   {
@@ -528,14 +561,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 28,
    "id": "3a1af6ad-036b-4eeb-b640-8642a75da17b",
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   us_title_count\n",
+      "0         1534894\n"
+     ]
+    }
+   ],
    "source": [
     "print(QueryOutput)"
    ]
@@ -1178,9 +1220,9 @@
   ],
   "instance_type": "ml.t3.medium",
   "kernelspec": {
-   "display_name": "Python 3 (Data Science 3.0)",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/sagemaker-data-science-310-v1"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -1192,7 +1234,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.12.1"
   }
  },
  "nbformat": 4,