Sweep run not closing

I am trying to run a sweep for hyperparameter tuning. The goal is to get the best parameters according to val_loss and use them to retrain another model.
But the problem is that when I implement both of these steps, instead of creating a separate run for retraining the model, it repeats the last run of the sweep.
Here is a template of my script:

# Log in to the W&B account before creating sweeps or runs
wandb.login()

# Sweep definition: 'bayes' search that minimizes the logged `val_loss` metric.
sweep_config_up = {
    'method': 'bayes',
    'metric': {'name': 'val_loss', 'goal': 'minimize'},
    'parameters': {
        # discrete choices for the network and optimizer settings
        'dropout': {'values': [0.2, 0.25]},
        'hidden_layer_size': {'values': [128, 256]},
        'layer_1_size': {'values': [8, 16, 32]},
        'layer_2_size': {'values': [32, 64, 96]},
        'decay': {'values': [1e-6, 1e-5]},
        'momentum': {'values': [0.85, 0.9]},
        'epoch': {'values': [5, 10]},
        # a flat distribution between 0 and 0.01
        'learn_rate': {'distribution': 'uniform', 'min': 0, 'max': 0.01},
    },
}

def build_model(config, img_width, img_height, num_classes):
  """Assemble the (uncompiled) CNN classifier described by ``config``.

  Reads ``layer_1_size``, ``layer_2_size``, ``dropout`` and
  ``hidden_layer_size`` from ``config``. The first layer declares an input
  shape of ``(img_width, img_height, 1)`` and the last layer is a softmax
  over ``num_classes`` units.
  """
  kernel = (5, 5)
  pool = (2, 2)

  model = Sequential()
  # Two conv/pool stages, dropout, then a dense head.
  stack = [
      Conv2D(config.layer_1_size, kernel, activation='relu',
             input_shape=(img_width, img_height, 1)),
      MaxPooling2D(pool_size=pool),
      Conv2D(config.layer_2_size, kernel, activation='relu'),
      MaxPooling2D(pool_size=pool),
      Dropout(config.dropout),
      Flatten(),
      Dense(config.hidden_layer_size, activation='relu'),
      Dense(num_classes, activation='softmax'),
  ]
  for layer in stack:
    model.add(layer)
  return model

def load_data(width, height):
  """Load Fashion-MNIST and prepare it for the CNN.

  Images are cast to float32, divided by 255 and reshaped to
  ``(n_samples, width, height, 1)``; targets are one-hot encoded.

  Returns:
    ``(X_train, y_train, X_test, y_test, num_classes, labels)`` where
    ``num_classes`` is the width of the one-hot test targets and ``labels``
    is the list of class names.
  """
  class_names = ["T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
                 "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"]

  (train_images, train_targets), (test_images, test_targets) = fashion_mnist.load_data()

  def _prepare(images):
    # rescale, then add the trailing channel dimension
    scaled = images.astype('float32') / 255.
    return scaled.reshape(scaled.shape[0], width, height, 1)

  train_images = _prepare(train_images)
  test_images = _prepare(test_images)

  # one hot encode outputs
  train_targets = np_utils.to_categorical(train_targets)
  test_targets = np_utils.to_categorical(test_targets)

  return (train_images, train_targets, test_images, test_targets,
          test_targets.shape[1], class_names)

def model_train():
  """Train one model inside a W&B run; passed to ``wandb.agent`` as the sweep function.

  Opens a run in group ``sweep_runings`` / job type ``training_new`` with a
  set of default hyperparameters, then builds, compiles and fits the CNN
  using the values found in ``wandb.config``.
  """
  # default hyperparameters handed to wandb.init as the run config
  config_defaults = {
      'dropout': 0.2,
      'hidden_layer_size': 128,
      'layer_1_size': 16,
      'layer_2_size': 32,
      'learn_rate': 0.01,
      'decay': 1e-6,
      'momentum': 0.9,
      'epoch': 30,
  }
  # initialize wandb; the run is used as a context manager around training
  with wandb.init(reinit=True, config=config_defaults, magic=True,
                  group='sweep_runings', job_type='training_new'):
    hparams = wandb.config
    # image height and width
    img_width = img_height = 28
    # load the data
    X_train, y_train, X_test, y_test, num_classes, labels = load_data(img_width, img_height)
    # build the model
    net = build_model(hparams, img_width, img_height, num_classes)
    # W&B Keras callback
    wandb_callbacks = [WandbCallback(data_type="image", labels=labels)]
    # SGD optimizer with Nesterov momentum
    optimizer = SGD(
        learning_rate=hparams.learn_rate,
        decay=hparams.decay,
        momentum=hparams.momentum,
        nesterov=True,
    )
    # compile and fit the model
    net.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    net.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=hparams.epoch,
        callbacks=wandb_callbacks,
    )

# Register the sweep in the "sweeps" project and run two trials in this process.
sweep_id = wandb.sweep(sweep_config_up, project="sweeps")

wandb.agent(sweep_id, function=model_train, count=2)

wandb.finish()

# NOTE(review): the in-process sweep agent exports these environment variables
# for each trial, and (at least in some wandb releases) they are still set
# after wandb.agent() returns. A later wandb.init() in the same process then
# picks up the last trial's run id / sweep id instead of starting a fresh run,
# which matches the "last sweep run is repeated" symptom. Clearing them is a
# no-op when they are not set. Confirm against the installed wandb version.
for _sweep_env_var in ("WANDB_RUN_ID", "WANDB_SWEEP_ID", "WANDB_SWEEP_PARAM_PATH"):
  os.environ.pop(_sweep_env_var, None)

print('--------------------------------------------------------finish_train model-------------------------------------------------')


print('--------------------------------------------------------Retrain model-------------------------------------------------')

# get best model parameters
api = wandb.Api()
api_dir = 'some/sweeps/'
# API paths always use "/" separators, so build the path as a plain string
# rather than with os.path.join (which is OS-dependent).
sweep = api.sweep("/".join([api_dir.rstrip('/'), sweep_id]))

# Get best run parameters
best_run = sweep.best_run()
best_parameters = best_run.config
print(best_parameters)

wandb.finish()

# where the retrained model file is written and what it is called
model_path = os.getcwd()
model_name = 'model.h5'
prefix = 'cnn_model'
#retrain the model and log weights 
#retrain the model and log weights
def retrain_model(best_parameters, model_path, model_name, prefix):
  """Retrain the CNN with ``best_parameters`` and log the saved model as a W&B artifact.

  Args:
    best_parameters: hyperparameter mapping used as the run config and as the
      artifact metadata.
    model_path: directory in which the trained model file is written.
    model_name: file name of the saved model inside ``model_path``.
    prefix: prefix for the artifact name (``"<prefix>_weights"``).

  The run is created in project ``sweeps``, group ``best_model``, job type
  ``training_new_model``.
  """
  project_name = "sweeps"
  # One path for both saving and uploading: previously the model was saved to
  # the relative ``model_name`` while the artifact read
  # ``model_path/model_name``, which only agree when model_path is the cwd.
  model_file = os.path.join(model_path, model_name)

  run = wandb.init(reinit=True, config=best_parameters, project=project_name,
                   group='best_model', job_type='training_new_model')
  # The context manager finishes the run on exit, so no explicit run.finish()
  # is needed inside the block.
  with run:
    config = wandb.config
    #specify height and width of the image
    img_width, img_height = 28, 28
    #load the data
    X_train, y_train, X_test, y_test, num_classes, labels = load_data(img_width, img_height)
    #build the model
    model = build_model(config, img_width, img_height, num_classes)
    #define the callbacks
    callbacks = [WandbCallback(data_type="image", labels=labels, log_weights=True)]
    #define the optimizer
    sgd = SGD(learning_rate=config.learn_rate, decay=config.decay, momentum=config.momentum,
              nesterov=True)
    #compile and fit the model
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
    model.fit(X_train, y_train, validation_data=(X_test, y_test),
              epochs=config.epoch,
              callbacks=callbacks)
    model.save(model_file)

    #artifact name, e.g. "<prefix>_weights"
    weight_at = "_".join([prefix, "weights"])
    #create artifact
    weight_model_at = wandb.Artifact(weight_at, type="model_weight", metadata=best_parameters)

    # save trained model as artifact
    weight_model_at.add_file(model_file)

    # save artifact to W&B
    run.log_artifact(weight_model_at)

# Retrain once with the best configuration found by the sweep.
retrain_model(
    best_parameters=best_parameters,
    model_path=model_path,
    model_name=model_name,
    prefix=prefix,
)

print('--------------------------------------------------------finish_Retrain model-------------------------------------------------')


Here is an image of the resulting runs

I would be extremely grateful if you can provide a suggestion or highlight the mistake. Thanks

Hi @hassanw65 , thank you for writing in and submitting code to reproduce your particular issue. We will review and get back to you soon.

Any update @mohammadbakir

Hi @hassanw65 ,

From your code, after running the sweep and grouping all its runs under sweep_runings and job type training_new, you retrieve the best run, sweep.best_run(), from the sweep and its associated configuration.

# Get best run parameters: take the sweep's best run and read its config
best_run = sweep.best_run()
best_parameters = best_run.config

When calling sweep.best_run(), the API first sorts runs by the val_loss score and returns the configuration of the run that most minimized the score (which could very well be the last run of the sweep). In the image attached you have a single run, thus the configuration of this run will be used.

You then pass these configurations to the retrain_model() function as best_parameters which are then used in the function to retrain the model under a new group, best_model and new job type training_new_model.

The single issue I see with your code that would cause a user to perceive unusual behavior when reading logs is the following.

In your model_train() function you are initializing a run with the default config config_defaults, which is defined within your function. This is passed to wandb.init(config=config_defaults...) and is used to log runs. These runs will be logged to the project uncategorized, as no project name is defined in wandb.init. The sweep runs will be logged to the project sweeps per the line sweep_id = wandb.sweep(sweep_config_up, project="sweeps"). Was this intended by you?

1 Like

@muhmammadbakir The problem is that, in the attached image, I have done 2 runs in the sweep. But after the model is retrained according to the best parameters, the last run of the sweep is repeated and grouped. The run with the best parameters is not started at all.
However, if I run the sweep, then restart the notebook session, and then retrain according to the best parameters, a new run with the best config is started. This is in addition to the 2 runs in the sweep (which is what I want).

As for your second question, I haven't set a project name in init because if I do that in addition to the project name in the sweep, I get an error saying that I can't do it two times. Moreover, in the wandb Colab notebooks, in almost all sweeps the project name is defined inside the sweep instead of in init inside the model_train function.
Moreover, no run is created under the uncategorized project.