Hi Wandb,
Goal:
Run a launch job from the wandb website on a launch-agent. The launch-agent builds a Docker image from the downloaded artifacts and executes a training run inside that image's environment.
Issue:
The launch-agent “finishes” a launch job without any indication of a failure or crash. However, the logged run on the wandb website shows no logged results and the state “crashed”. The Dockerfile entry point script, however, runs successfully on its own, separately from launch: it logs data to wandb and completes the run. The “error.log”, “debug.log”, and “debug-internal.log” files show no indication of any errors either.
Dockerfile.wandb:
# Image used by the launch-agent to execute learn_manual.py.
FROM python:3.8.12

# All relative paths below (requirements.txt, learn_manual.py) resolve
# against this directory.
WORKDIR /launch/Development

# Install the system library and drop the apt package lists in the same
# layer so they do not end up in the final image.
RUN apt-get update \
    && apt-get install -y libgl1-mesa-glx \
    && rm -rf /var/lib/apt/lists/*

# NOTE(review): ENV bakes the API key into the image layers, where anyone
# with access to the image can read it. Prefer injecting WANDB_API_KEY at
# container start (e.g. `docker run -e WANDB_API_KEY=...`) - confirm how the
# launch-agent supplies credentials before removing this line.
ENV WANDB_API_KEY=...

# copy requirements file first so the dependency layers below stay cached
# when only project sources change
COPY Development/requirements.txt /launch/Development/requirements.txt

# install python dependencies (--no-cache-dir keeps pip's download cache out
# of the image)
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install --no-cache-dir torch==1.13.0+cu117 torchvision==0.14.0+cu117 torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cu117

# copy the remaining project files; the build context root lands in /launch,
# so the context's Development/ directory becomes the WORKDIR
COPY . /launch

# Runs /launch/Development/learn_manual.py (path is relative to WORKDIR).
ENTRYPOINT ["python", "learn_manual.py"]
learn_manual.py:
class Agent:
    """Own one wandb run together with the SAC model trained inside it.

    Constructing an instance creates a temporary directory, starts the wandb
    run, builds the vectorized training environment and instantiates the SAC
    model from values read out of ``wandb.config``.
    """

    def __init__(self, config, project, policy_kwargs):
        """Start the wandb run and build the environment and model.

        Args:
            config: Hyperparameters handed to ``wandb.init``. The methods of
                this class read ``policy_type``, ``buffer_size``,
                ``batch_size``, ``total_timesteps`` and ``log_interval``
                back through ``wandb.config``.
            project: Name of the wandb project the run is logged to.
            policy_kwargs: Passed unchanged to ``SAC`` as ``policy_kwargs``.
        """
        self.config = config
        self.policy_kwargs = policy_kwargs
        self.temp_dir = None

        # Create the temp working directory that will hold model.zip.
        self._create_temp_folder()

        # Initialize wandb before the model is built: the SAC arguments
        # below are read from wandb.config, not from self.config.
        self.run = wandb.init(
            project=project,
            mode="online",
            config=self.config,
            sync_tensorboard=True,
        )

        # Generate the (vectorized) environment.
        self.env = DummyVecEnv([self._make_train_env])
        # self.env = make_vec_env(self._make_train_env, n_envs=2) #trying multiple environments

        # Generate the model.
        self.model = SAC(
            policy=wandb.config.policy_type,
            env=self.env,
            policy_kwargs=self.policy_kwargs,
            # tensorboard_log=self.files_dict['log_dir'],
            # action_noise= NormalActionNoise(mean=np.zeros(self.env.action_space.shape[-1]), sigma=0.1*np.ones(self.env.action_space.shape[-1])) if wandb.config.action_noise == True else None,
            # learning_starts=wandb.config.learning_starts,
            # learning_rate=wandb.config.learning_rate,
            buffer_size=wandb.config.buffer_size,
            batch_size=wandb.config.batch_size,
        )

    def _make_train_env(self):
        """Build one training environment wrapped in Monitor and TrainLogging."""
        # Create the environment.
        env = PackCoordinator(skip_topo=True).create_env(env_render_mode="none")
        # Wrap the environment.
        env = Monitor(env)
        env = TrainLogging(env)
        return env

    def _create_temp_folder(self):
        """Create the temporary directory and keep its handle on ``self.temp_dir``."""
        self.temp_dir = tempfile.TemporaryDirectory()

    def train(self):
        """Train the model, then save it into the temporary directory."""
        save_dir = Path(self.temp_dir.name)
        self.model.learn(
            total_timesteps=wandb.config.total_timesteps,
            log_interval=wandb.config.log_interval,
            reset_num_timesteps=True,
            progress_bar=True,
            callback=WandbCallback(
                # upload saved model file to wandb on completion
                model_save_path=save_dir,
            ),
        )
        # save() expects a file path and appends ".zip" when the suffix is
        # missing. Passing the directory itself would therefore write
        # "<tmpdir>.zip" next to the temp folder, where cleanup() never
        # removes it; name the file explicitly so model.zip lands inside.
        self.model.save(save_dir / "model")

    def evaluate(self, eval_eps, num_samples):
        """Run ``Evaluate`` on the current model.

        Args:
            eval_eps: Forwarded to ``Evaluate.evaluate`` as first argument.
            num_samples: Forwarded to ``Evaluate.evaluate`` as second argument.
        """
        eval_run = Evaluate(self.model)
        eval_run.evaluate(eval_eps, num_samples)

    def end_run(self):
        """Finish the wandb run and delete the temporary directory."""
        try:
            self.run.finish()
        finally:
            # Clear the temp directory for the uploaded model.zip even when
            # finishing the run raises.
            self.temp_dir.cleanup()
# Run hyperparameters; handed to Agent as its wandb config.
config = dict(
    log_interval=1,
    algo="sac",  # one of: sac, td3, ppo, a2c
    # MultiInputPolicy or CnnPolicy. Note: the number of channels in
    # ai_settings.py obs_shape needs to be adjusted to match.
    policy_type="CnnPolicy",
    total_timesteps=256,  # with fewer than 100 steps the train metric won't log
    # learning_starts=100,
    learning_rate=0.0003,
    buffer_size=256,
    batch_size=64,
)

# Policy construction options: custom CNN feature extractor sized from
# ai_set.grid_mapper_dim, with image normalization switched off.
policy_kwargs = {
    "features_extractor_class": CustomCNN,
    "features_extractor_kwargs": {"features_dim": ai_set.grid_mapper_dim[0]},
    "normalize_images": False,
}

# Initialize the model to be trained.
sac_model = Agent(config=config, project="test", policy_kwargs=policy_kwargs)
# Train the model.
sac_model.train()
# Evaluate the trained model.
sac_model.evaluate(eval_eps=20, num_samples=5)
# End the wandb run.
sac_model.end_run()