Created
June 14, 2023 09:11
-
-
Save ganindu7/80b18de9ab65eb90f02726bb4adcd837 to your computer and use it in GitHub Desktop.
I crafted this reference notebook to debug a problem where TAO training was getting stuck.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import requests\n",
"workdir = \"/home/ganindu/Workspace/AI/TAO\" # This is in my local pc where I run a pyenv virtualenv 3.10.11\n",
"datadir = f\"{workdir}/DATA\"\n",
"rootca = 'rootCA.pem' # self signed certificate\n",
"host_url = \"https://aisrv.gnet.lan:30907/tao-gnet\" # FIXME2 example: https://10.137.149.22:32334\n",
"ngc_api_key = \"a3NvZ2xvcn...\" \n",
"response = requests.get(f\"{host_url}/api/v1/login/{ngc_api_key}\", verify=rootca)\n",
"\n",
"# get user id and token \n",
"user_id = response.json()[\"user_id\"]\n",
"base_url = f\"{host_url}/api/v1/user/{user_id}\" # base url for all API calls\n",
"token = response.json()[\"token\"]\n",
"headers = {\"Authorization\": f\"Bearer {token}\"} # JWT token\n",
"\n",
"# print stuff so that we can check the connection is working\n",
"print(\"User ID\",user_id)\n",
"print(\"JWT\",token)\n",
"print(\"API Calls will be forwarded to\",base_url)\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create a model \n",
"\n",
"network_arch = \"ssd\"\n",
"encode_key = None # default to tlt_encode?\n",
"model_name = \"net-debug\"\n",
"model_version = \"0.1.0\"\n",
"data = json.dumps({\"network_arch\":network_arch, \\\n",
"                   \"encode_key\":encode_key, \\\n",
"                   \"name\":model_name, \\\n",
"                   \"version\":model_version, \\\n",
"                   \"description\":\"A debug model for Nvidia Developer forum 256558\"}) \n",
"\n",
"endpoint = f\"{base_url}/model\"\n",
"response = requests.post(endpoint, data=data, headers=headers, verify=rootca)\n",
"\n",
"print(response)\n",
"print(response.json())\n",
"model_id = response.json()[\"id\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# List models to make sure that the model is created\n",
"endpoint = f\"{base_url}/model\"\n",
"response = requests.get(endpoint, headers=headers, verify=rootca)\n",
"model_list = []\n",
"print(f\"{'id':<40} {'name':<40} {'network_arch':<30} {'version':<15}\")\n",
"for rsp in response.json():\n",
"    print(f\"{rsp['id']:<40} {rsp['name']:<40} {rsp['network_arch']:<30} {rsp['version']:<15}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# assign a dataset to the model\n",
"\n",
"model_ID = \"0c671979-5f80-5d61-96ed-ae0847a37e68\" # id of our model \n",
"train_dataset_id = \"234108a7-ded2-4389-83a7-8a5efd270b9f\" # id of the dataset we want to use for training\n",
"eval_dataset_id = \"d7f763b4-9fd3-498d-94b8-b82ccab74fc5\" # id of the dataset we want to use for evaluation\n",
"test_dataset_id = \"8ae29c37-2c2f-4367-a58c-103ce4ee5904\" # id of the dataset we want to use for testing\n",
"\n",
"network_arch = \"ssd\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pack it in \n",
"dataset_information = {\"train_datasets\" : [train_dataset_id], \n",
"                       \"eval_dataset\" : eval_dataset_id, \n",
"                       \"inference_dataset\" : test_dataset_id, \n",
"                       \"calibration_dataset\" : train_dataset_id\n",
"                       }\n",
"\n",
"data = json.dumps(dataset_information)\n",
"endpoint = f\"{base_url}/model/{model_ID}\"\n",
"response = requests.patch(endpoint, data=data, headers=headers, verify=rootca)\n",
"\n",
"print(response)\n",
"print(response.json())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# look for a suitable pretrained model\n",
"\n",
"model_list = f'{base_url}/model'\n",
"response = requests.get(model_list, headers=headers, verify=rootca)\n",
"response_json = response.json()\n",
"\n",
"# search for pretrained model in ngc path\n",
"\n",
"ptm_id = None\n",
"model_index = -1\n",
"for rsp in response_json:\n",
"    model_index += 1\n",
"    if rsp['network_arch'] == network_arch and \"pretrained_object_detection:resnet18\" in rsp['ngc_path']:\n",
"        ptm_id = rsp['id']\n",
"        print(rsp)\n",
"        break\n",
"\n",
"ssd_ptm = ptm_id\n",
"\n",
"# print the network architecture and ngc path of the model\n",
"print(f\"network arch preferred: {network_arch}\")\n",
"print(f\"network arch selected: {response_json[model_index]['network_arch']}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Assign the pretrained model to the model we created\n",
"\n",
"ptm_information = {\"ptm\": ssd_ptm}\n",
"data = json.dumps(ptm_information)\n",
"endpoint = f\"{base_url}/model/{model_ID}\"\n",
"response = requests.patch(endpoint, data=data, headers=headers, verify=rootca)\n",
"\n",
"print(data)\n",
"print(response)\n",
"print(response.json())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get training specs\n",
"endpoint = f\"{base_url}/model/{model_ID}/specs/train/schema\"\n",
"print(f'endpoint: {endpoint}')\n",
"response = requests.get(endpoint, headers=headers, verify=rootca)\n",
"print(response)\n",
"\n",
"specs = response.json()[\"default\"]\n",
"print(json.dumps(specs, sort_keys=True, indent=4))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# do a little customisation (we can't wait for ages to train a debug model)\n",
"\n",
"num_epochs = 10 # we can increase later \n",
"specs[\"training_config\"][\"num_epochs\"] = num_epochs\n",
"specs[\"eval_config\"][\"validation_period_during_training\"] = 2 # so we get updates to our plots often\n",
"\n",
"# are we visualizing the training in clearml\n",
"specs[\"training_config\"][\"visualizer\"][\"enabled\"] = True\n",
"\n",
"# clearml config section\n",
"specs[\"training_config\"][\"visualizer\"][\"clearml_config\"] = {}\n",
"specs[\"training_config\"][\"visualizer\"][\"clearml_config\"][\"project\"] = \"test_stuff\"\n",
"specs[\"training_config\"][\"visualizer\"][\"clearml_config\"][\"tags\"] = [\"training\", \"tao_toolkit\"]\n",
"specs[\"training_config\"][\"visualizer\"][\"clearml_config\"][\"task\"] = \"test_1\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# post updated spec \n",
"data = json.dumps(specs)\n",
"endpoint = f\"{base_url}/model/{model_ID}/specs/train\"\n",
"response = requests.post(endpoint, data=data, headers=headers, verify=rootca)\n",
"print(response)\n",
"print(json.dumps(response.json(), sort_keys=True, indent=4))\n",
"\n",
"training_spec = json.dumps(response.json())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Try training \n",
"parent = None\n",
"actions = [\"train\"]\n",
"data = json.dumps({\"job\":parent, \"actions\":actions})\n",
"endpoint = f\"{base_url}/model/{model_ID}/job\"\n",
"response = requests.post(endpoint, data=data, headers=headers, verify=rootca)\n",
"print(response)\n",
"print(json.dumps(response.json(), sort_keys=True, indent=4))"
]
}
],
"metadata": {
"language_info": {
"name": "python"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment