Рассмотрим пример
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
class TwoLinLayerNet(torch.nn.Module):

#!/bin/bash
# Environment setup for a CUDA toolchain install: pins toolkit versions
# (overridable from the caller's environment) and derives the distro tag.

# Fail fast: exit on any error or unset variable, propagate failures
# through pipelines, and disable filename globbing.
set -e -u -o pipefail -o noglob
# Trace each command as it executes (debug output).
set -x

# Component versions; callers may override via the environment.
CUDA_VERSION=${CUDA_VERSION:-10.2}
CUDNN_VERSION=${CUDNN_VERSION:-7}
TENSORRT_VERSION=${TENSORRT_VERSION:-7}

# Distro identifier in NVIDIA-repo form, e.g. "18.04" -> "ubuntu1804"
# (the substitution strips every dot from the release string).
UBUNTU_RELEASE=$(lsb_release -rs)     # e.g. 18.04
DISTRO=ubuntu${UBUNTU_RELEASE//\./}   # e.g. ubuntu1804
# Periodically mirror a local file to HDFS while a watched process is
# alive, then perform one final sync after the process exits.

PID=123                        # process whose lifetime gates the loop
path=/home/user123/file.txt    # local file to mirror into HDFS

# /proc/$PID exists exactly while the process is running.
while test -d /proc/$PID; do
    # -f overwrites the existing HDFS copy on every pass.
    hdfs dfs -put -f "$path"
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] synced from $path, waiting for 1h..."
    sleep 1h
done

# Process has exited: wait one more interval, then sync the final state.
echo "Sync $path"
sleep 1h
hdfs dfs -put -f "$path"
| import os | |
| import torch as th | |
| import torch.distributed as dist | |
| import torch.multiprocessing as mp | |
| def run(rank: int, value: float, src:int, dst: int): | |
| tensor = th.FloatTensor([value,])#.to(f"cuda:{rank}") | |
| print(f"[rk={rank}] tensor before send-recv: {tensor}") | |
| req = dist.isend(tensor=tensor, dst=dst) |
Рассмотрим пример
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
class TwoLinLayerNet(torch.nn.Module):

Plan A: just run `sudo nvidia-uninstall`.

Plan B:
# step 1: become root
sudo su
# step 2: purge every installed NVIDIA package
dpkg -l | grep -i nvidia | awk '{print $2}' | xargs -n1 sudo apt-get purge -y
Find commits containing a given string of code (for example, the name of a function):

git rev-list --all --since 2022-04-01 --until 2022-05-01 | xargs git grep "<some pattern>"