Module AmpliVision.src.ML.utils

Functions and classes used across machine learning workflows.

Classes

class ML_Utils (path_to_imgs, scanned_path, id_str)
Expand source code
class  ML_Utils:
    def __init__(
        self, 
        path_to_imgs,
        scanned_path,
        id_str
        ):
    
        self.path_to_imgs = path_to_imgs  # e.g. "data/scanned/*" or "DENV_imgs/*"
        self.scanned_path = scanned_path  # e.g. "data/scanned/"
        self.id_str = id_str

        # run the image-processing phases needed by the rule-based generator
        self.prepare_image_RBGen()

        # instantiate the nested callback class for use during training
        self.PlotCallback = self.PlotCallback(id_str)


    def prepare_image_RBGen(self, display= False):
        """ Does initial setup needed to create RBG """

        # Phase A.1 - Scanning images
        Images = phaseA1(
            self.path_to_imgs, 
            self.scanned_path,
            display=display, 
            do_white_balance=False,
            is_pre_scanned="scanned" in self.path_to_imgs
        )
    
        # Phase A.2 - Grids
        Grids = phaseA2(Images, display=display)
        del Images

        # Phase B - extract test results
        self.results = phaseB(Grids, display=display)
        print(len(Grids))  # number of grids processed
     
        # Phase A.3 - Position Graph
        self.graphs = phaseA3(Grids, display=display)
        del Grids


    def build_dataset(
            self, 
            TARGETS, 
            BATCH_N, 
            SIZE,
            BLACK = False,
            OUTLIER = False,
            contamination = 0.05
             
        ):
        """ Creates a dataset using rule based generator to work with tensor flow """

        RBG = RuleBasedGenerator(self.graphs, self.results)
        RBG.setup()

        _args = [
            TARGETS,  # which TARGETS to generate
            0.05,     # noise
            BLACK,    # black background or not
            True,     # rgb
            False,    # save
        ]

        # the outlier-detection generator also takes a contamination rate
        if OUTLIER:
            _args.append(contamination)


        # transform the generator into a tf.data.Dataset; images are
        # 1242x1242 RGB, labels are a length-2 vector for outlier detection
        # or a one-hot vector over TARGETS otherwise
        g_dataset = tf.data.Dataset.from_generator(
            RBG.generate_for_od if OUTLIER else RBG.generate,
            output_shapes=(
                [1242, 1242, 3],
                2 if OUTLIER else [len(TARGETS)]
            ),
            output_types=(tf.float32, tf.float32),
            args=_args
        )
        
        # each element becomes (resized, randomly rotated image scaled to [0, 1], label)
        g_dataset = g_dataset.map(
            lambda x, y: (
                # x - Image
                tf.cast(
                    tf.image.rot90(
                        tf.image.resize(
                            x, 
                            SIZE
                        ),
                        k = tf.random.uniform(shape=[], minval=0, maxval=4, dtype=tf.int32)
                    ),
                    tf.float32
                ) / 255, 

                # y - Label
                tf.cast(y, tf.float32)
            ),
            num_parallel_calls=1
        )
        g_dataset = g_dataset.batch(batch_size=BATCH_N)
        g_dataset = g_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

        return g_dataset
    

    def test_dataset(self):
        """ Prints one generated batch to verify that data is being generated correctly. """

        # NOTE: this order is probably wrong relative to the generator's label order
        classes = ['lung', 'thyroid', 'ovarian', 'prostate', 'skin', 'control', 'breast']

        # keep the native 1242x1242 resolution for visual inspection
        dataset = self.build_dataset(classes, BATCH_N=1, SIZE=[1242, 1242], BLACK=True)
        for img, label in dataset.take(1):
            print(img.shape)
            for i, im in enumerate(img):
                print(f"\n{classes[np.where(label[i].numpy() == 1)[0][0]]}")
                #plt.imshow(im)
                #plt.show()


    def plot_model_performance(self, history, fig_name):
        """ Plots training/validation accuracy and loss from a Keras History and saves them to <fig_name>_acc.png and <fig_name>_loss.png. """

        # summarize history for accuracy
        plt.plot(history.history['accuracy'])
        plt.plot(history.history['val_accuracy'])
        plt.title('model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['train', 'val'], loc='upper left')
        plt.savefig(fig_name+"_acc.png")
    
        plt.clf()

        # summarize history for loss
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'val'], loc='upper left')
        plt.savefig(fig_name+"_loss.png")

        
    class PlotCallback(tf.keras.callbacks.Callback):
        def __init__(self, id_str):
            super(ML_Utils.PlotCallback, self).__init__()
            self.path = f"{os.getcwd()}/AmpliVision/data/"
            self.id_str = id_str

        def on_train_begin(self, logs={}):
            
            # accuracy and loss for each epoch
            self.epoch_accuracy = []
            self.epoch_val_accuracy = []
            self.epoch_loss = []
            self.epoch_val_loss = []

            # confusion matrix for each epoch
            self.epoch_confusion_matrix = []


        
        def on_epoch_end(self, epoch, logs={}):
            
            # plot the accuracy and loss
            self.plot_acc_loss(epoch, logs)

            # save the history of the model
            with open(self.path + "ML_models/" + f"history_{self.id_str}.pkl", 'wb') as file_pi:
                pkl.dump(self.model.history.history, file_pi)


        def plot_acc_loss(self, epoch, logs={}):
            # Append the metrics for each epoch
            self.epoch_accuracy.append(logs.get('accuracy'))
            self.epoch_val_accuracy.append(logs.get('val_accuracy'))
            self.epoch_loss.append(logs.get('loss'))
            self.epoch_val_loss.append(logs.get('val_loss'))
            
            # Clear the current plot to start a new one
            plt.clf()
            
            # Plot accuracy
            plt.subplot(1, 2, 1)
            plt.plot(self.epoch_accuracy, label='Training Accuracy')
            plt.plot(self.epoch_val_accuracy, label='Validation Accuracy')
            plt.title('Model Accuracy')
            plt.xlabel('Epoch')
            plt.ylabel('Accuracy')
            plt.legend(loc='upper left')
            
            # Plot loss
            plt.subplot(1, 2, 2)
            plt.plot(self.epoch_loss, label='Training Loss')
            plt.plot(self.epoch_val_loss, label='Validation Loss')
            plt.title('Model Loss')
            plt.xlabel('Epoch')
            plt.ylabel('Loss')
            plt.legend(loc='upper right')
            
            # Save the figure to a file after each epoch
            file_path = self.path + "ML_perform/" + f"{self.id_str}.png"
            plt.savefig(file_path)
            print(f"Saved plot for epoch {epoch+1} at {file_path}")

Class variables

var PlotCallback

Abstract base class used to build new callbacks.

Callbacks can be passed to keras methods such as fit, evaluate, and predict in order to hook into the various stages of the model training and inference lifecycle.

To create a custom callback, subclass keras.callbacks.Callback and override the method associated with the stage of interest. See https://www.tensorflow.org/guide/keras/custom_callback for more information.

Example:

>>> training_finished = False
>>> class MyCallback(tf.keras.callbacks.Callback):
...   def on_train_end(self, logs=None):
...     global training_finished
...     training_finished = True
>>> model = tf.keras.Sequential([
...     tf.keras.layers.Dense(1, input_shape=(1,))])
>>> model.compile(loss='mean_squared_error')
>>> model.fit(tf.constant([[1.0]]), tf.constant([[1.0]]),
...           callbacks=[MyCallback()])
>>> assert training_finished == True

If you want to use Callback objects in a custom training loop:

  1. You should pack all your callbacks into a single callbacks.CallbackList so they can all be called together.
  2. You will need to manually call all the on_* methods at the appropriate locations in your loop. Like this:

Example:

   callbacks =  tf.keras.callbacks.CallbackList([...])
   callbacks.append(...)
   callbacks.on_train_begin(...)
   for epoch in range(EPOCHS):
     callbacks.on_epoch_begin(epoch)
     for i, data in dataset.enumerate():
       callbacks.on_train_batch_begin(i)
       batch_logs = model.train_step(data)
       callbacks.on_train_batch_end(i, batch_logs)
     epoch_logs = ...
     callbacks.on_epoch_end(epoch, epoch_logs)
   final_logs=...
   callbacks.on_train_end(final_logs)

Attributes

params
Dict. Training parameters (e.g. verbosity, batch size, number of epochs…).
model
Instance of keras.models.Model. Reference of the model being trained.

The logs dictionary that callback methods take as argument will contain keys for quantities relevant to the current batch or epoch (see method-specific docstrings).

Methods

def build_dataset(self, TARGETS, BATCH_N, SIZE, BLACK=False, OUTLIER=False, contamination=0.05)

Creates a tf.data.Dataset from the rule-based generator for use with TensorFlow.
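
For illustration, a call might look like the following (the target list mirrors the one hard-coded in test_dataset and the image size is a placeholder):

    TARGETS = ['lung', 'thyroid', 'ovarian', 'prostate', 'skin', 'control', 'breast']
    train_ds = utils.build_dataset(TARGETS, BATCH_N=16, SIZE=[256, 256])
    # With OUTLIER=True the dataset is built from RBG.generate_for_od instead,
    # labels become length-2 vectors, and contamination is forwarded to the generator.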

def plot_model_performance(self, history, fig_name)
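
Plots training and validation accuracy and loss from a Keras History object and saves them to fig_name_acc.png and fig_name_loss.png. For example (history would come from an earlier call to model.fit; the file prefix is arbitrary):

    history = model.fit(train_ds, validation_data=val_ds, epochs=10)
    utils.plot_model_performance(history, "demo_run")
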
def prepare_image_RBGen(self, display=False)

Runs the image-processing phases needed to build the rule-based generator (RBG).
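
It runs phase A.1 (image scanning), phase A.2 (grid extraction) and phase B (test-result extraction), then builds the position graphs in phase A.3, storing the results and graphs on the instance. It is already called from __init__, so an explicit call is only needed to re-run the pipeline, for example with intermediate display enabled:

    utils.prepare_image_RBGen(display=True)  # presumably shows intermediate results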

def test_dataset(self)

Prints one generated batch to verify that data is being generated correctly.
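
For example, this simply prints the batch shape and the class name inferred from each one-hot label:

    utils.test_dataset()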