Skip to content

Instantly share code, notes, and snippets.

@ganindu7
Created June 14, 2023 09:11
Show Gist options
  • Save ganindu7/80b18de9ab65eb90f02726bb4adcd837 to your computer and use it in GitHub Desktop.
Save ganindu7/80b18de9ab65eb90f02726bb4adcd837 to your computer and use it in GitHub Desktop.
I crafted this reference notebook to debug a problem where TAO training was getting stuck
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import json\n",
"workdir = \"/home/ganindu/Workspace/AI/TAO\" # This is in my local pc where I run a pyenv virtualenv 3.10.11\n",
"datadir = f\"{workdir}/DATA\"\n",
"rootca = 'rootCA.pem' # self signed certificate\n",
"host_url = \"https://aisrv.gnet.lan:30907/tao-gnet\" # FIXME2 example: https://10.137.149.22:32334\n",
"ngc_api_key = \"a3NvZ2xvcn...\" \n",
"response = requests.get(f\"{host_url}/api/v1/login/{ngc_api_key}\", verify=rootca)\n",
"\n",
"# get user id and token \n",
"user_id = response.json()[\"user_id\"]\n",
"base_url = f\"{host_url}/api/v1/user/{user_id}\" # base url for all API calls\n",
"token = response.json()[\"token\"]\n",
"headers = {\"Authorization\": f\"Bearer {token}\"} # JWT token\n",
"\n",
"# print stuff so that we can check the connection is working\n",
"print(\"User ID\",user_id)\n",
"print(\"JWT\",token)\n",
"print(\"API Calls will be forwarded to\",base_url)\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create a model \n",
"\n",
"network_arch = \"ssd\"\n",
"encode_key = None # default to tlt_encode?\n",
"model_name = \"net-debug\"\n",
"model_version = \"0.1.0\"\n",
"data = json.dumps({\"network_arch\":network_arch, \\\n",
" \"encode_key\":encode_key, \\\n",
" \"name\":model_name, \\\n",
" \"version\":model_version, \\\n",
" \"description\":\"A debug model for Nvidia Developer forum 256558\"}) \n",
"\n",
"endpoint = f\"{base_url}/model\"\n",
"response = requests.post(endpoint, data=data, headers=headers, verify=rootca)\n",
"\n",
"print(response)\n",
"print(response.json())\n",
"model_id = response.json()[\"id\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# List models to make sure that the model is created\n",
"endpoint = f\"{base_url}/model\"\n",
"response = requests.get(endpoint, headers=headers, verify=rootca)\n",
"model_list = []\n",
"print(f\"{'id':<40} {'name':<40} {'network_arch':<30} {'version':<15}\")\n",
"for rsp in response.json():\n",
" print(f\"{rsp['id']:<40} {rsp['name']:<40} {rsp['network_arch']:<30} {rsp['version']:<15}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# assign a dataset to the model\n",
"\n",
"model_ID = \"0c671979-5f80-5d61-96ed-ae0847a37e68\" # id of our model\n",
"train_dataset_id = \"234108a7-ded2-4389-83a7-8a5efd270b9f\" # id of the dataset we want to use for training\n",
"eval_dataset_id = \"d7f763b4-9fd3-498d-94b8-b82ccab74fc5\" # id of the dataset we want to use for evaluation\n",
"test_dataset_id = \"8ae29c37-2c2f-4367-a58c-103ce4ee5904\" # id of the dataset we want to use for testing\n",
"\n",
"network_arch = \"ssd\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pack it in \n",
"dataset_information = {\"train_datasets\" : [train_dataset_id], \n",
" \"eval_dataset\" : eval_dataset_id, \n",
" \"inference_dataset\" : test_dataset_id, \n",
" \"calibration_dataset\" : train_dataset_id\n",
" }\n",
"\n",
"data = json.dumps(dataset_information)\n",
"endpoint = f\"{base_url}/model/{model_ID}\"\n",
"response = requests.patch(endpoint, data=data, headers=headers, verify=rootca)\n",
"\n",
"print(response)\n",
"print(response.json())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# look for a suitable pretrained model\n",
"\n",
"model_list = f'{base_url}/model'\n",
"response = requests.get(model_list, headers=headers, verify=rootca)\n",
"response_json = response.json()\n",
"\n",
"# search for pre-trained model in NGC path\n",
"\n",
"ptm_id = None\n",
"model_index = -1\n",
"for rsp in response_json:\n",
" model_index += 1\n",
" if rsp['network_arch'] == network_arch and \"pretrained_object_detection:resnet18\" in rsp['ngc_path']:\n",
" ptm_id = rsp['id']\n",
" print(rsp)\n",
" break\n",
"\n",
"ssd_ptm = ptm_id\n",
"\n",
"# print the network architecture and ngc path of the model\n",
"print(f\"network arch preferred: {network_arch}\")\n",
"print(f\"network arch selected: {response_json[model_index]['network_arch']}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Assign the pretrained model to the model we created\n",
"\n",
"ptm_information = {\"ptm\": ssd_ptm}\n",
"data = json.dumps(ptm_information)\n",
"endpoint = f\"{base_url}/model/{model_ID}\"\n",
"response = requests.patch(endpoint, data=data, headers=headers, verify=rootca)\n",
"\n",
"print(data)\n",
"print(response)\n",
"print(response.json())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get training specs\n",
"endpoint = f\"{base_url}/model/{model_ID}/specs/train/schema\"\n",
"print(f'endpoint: {endpoint}')\n",
"response = requests.get(endpoint, headers=headers, verify=rootca)\n",
"print(response)\n",
"\n",
"specs = response.json()[\"default\"]\n",
"print(json.dumps(specs, sort_keys=True, indent=4))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# do a little customisation (we can't wait for ages to train a debug model)\n",
"\n",
"num_epochs = 10 # we can increase later \n",
"specs[\"training_config\"][\"num_epochs\"] = num_epochs\n",
"specs[\"eval_config\"][\"validation_period_during_training\"] = 2 # so we get updates to our plots often\n",
"\n",
"# are we visualizing the training in clearml\n",
"specs[\"training_config\"][\"visualizer\"][\"enabled\"] = True\n",
"\n",
"# clearml config section\n",
"specs[\"training_config\"][\"visualizer\"][\"clearml_config\"] = {}\n",
"specs[\"training_config\"][\"visualizer\"][\"clearml_config\"][\"project\"] = \"test_stuff\"\n",
"specs[\"training_config\"][\"visualizer\"][\"clearml_config\"][\"tags\"] = [\"training\", \"tao_toolkit\"]\n",
"specs[\"training_config\"][\"visualizer\"][\"clearml_config\"][\"task\"] = \"test_1\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# post updated specs\n",
"data = json.dumps(specs)\n",
"endpoint = f\"{base_url}/model/{model_ID}/specs/train\"\n",
"response = requests.post(endpoint, data=data, headers=headers, verify=rootca)\n",
"print(response)\n",
"print(json.dumps(response.json(), sort_keys=True, indent=4))\n",
"\n",
"traning_spec = json.dumps(response.json())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Try training \n",
"parent = None\n",
"actions = [\"train\"]\n",
"data = json.dumps({\"job\":parent, \"actions\":actions})\n",
"endpoint = f\"{base_url}/model/{model_ID}/job\"\n",
"response = requests.post(endpoint, data=data, headers=headers, verify=rootca)\n",
"print(response)\n",
"print(json.dumps(response.json(), sort_keys=True, indent=4))"
]
}
],
"metadata": {
"language_info": {
"name": "python"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment