Friday, January 4, 2019

[TensorFlow] How to write op with gradient in python?

Recently I studied the paper "Domain-Adversarial Training of Neural Networks" (Ganin et al.), which is available on arXiv (arXiv:1505.07818).

The key point of this paper is the "Gradient Reversal Layer", which connects the feature extractor to the domain discriminator: it acts as the identity in the forward pass and reverses (negates and scales) the gradient in the backward pass. I found source code that implements it by replacing the Identity op's gradient function, as follows:

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
from tensorflow.python.framework import ops

class FlipGradientBuilder(object):
    """Callable factory for gradient reversal layers.

    Each call wraps ``x`` in a ``tf.identity`` op whose gradient function is
    overridden, so the forward pass returns ``x`` unchanged while the
    backward pass returns ``-grad * l``.
    """

    def __init__(self):
        # Number of layers built so far; used to give every registered
        # gradient function a distinct name ("FlipGradient0", "FlipGradient1", ...).
        self.num_calls = 0

    def __call__(self, x, l=1.0):
        """Return an identity of ``x`` whose gradient is negated and scaled.

        Args:
            x: Tensor to pass through unchanged in the forward pass.
            l: Multiplier applied to the negated gradient; may be a Python
                number or a tensor (e.g. a placeholder).

        Returns:
            The output tensor of the ``tf.identity`` op created for ``x``.
        """
        grad_name = "FlipGradient%d" % self.num_calls

        # The gradient function must be registered under ``grad_name``;
        # otherwise the override map below refers to a name that does not
        # exist and gradient computation through this op fails.
        @ops.RegisterGradient(grad_name)
        def _flip_gradients(op, grad):
            return [tf.negative(grad) * l]

        g = tf.get_default_graph()
        # Only Identity ops created inside this context use the custom gradient.
        with g.gradient_override_map({"Identity": grad_name}):
            y = tf.identity(x)

        self.num_calls += 1
        return y


flip_gradient = FlipGradientBuilder()

So, how do we use this Gradient Reversal Layer? The key line in the code below is `f_ = flip_gradient(F, l)` (highlighted in red in the original post):
(The code is from )

import tensorflow as tf
from utils import *
from flip_gradient import flip_gradient

batch_size = 16

X = tf.placeholder(tf.float32, [None, 2], name='X') # Input data
Y_ind = tf.placeholder(tf.int32, [None], name='Y_ind')  # Class index
D_ind = tf.placeholder(tf.int32, [None], name='D_ind')  # Domain index
train = tf.placeholder(tf.bool, [], name='train')       # Switch for routing data to class predictor
l = tf.placeholder(tf.float32, [], name='l')        # Gradient reversal scaler

# One-hot encode class and domain indices (two classes, two domains).
Y = tf.one_hot(Y_ind, 2)
D = tf.one_hot(D_ind, 2)

# Feature extractor - single layer
W0 = weight_variable([2, 15])
b0 = bias_variable([15])
F = tf.nn.relu(tf.matmul(X, W0) + b0, name='feature')

# Label predictor - single layer
# When `train` is true, only the first batch_size // 2 rows of the features
# and labels are routed to the label predictor; otherwise all rows are used.
# NOTE(review): presumably the first half of each training batch holds the
# labelled source-domain samples - confirm against the batch construction.
f = tf.cond(train, lambda: tf.slice(F, [0, 0], [batch_size // 2, -1]), lambda: F)
y = tf.cond(train, lambda: tf.slice(Y, [0, 0], [batch_size // 2, -1]), lambda: Y)

W1 = weight_variable([15, 2])
b1 = bias_variable([2])
p_logit = tf.matmul(f, W1) + b1
p = tf.nn.softmax(p_logit)
p_loss = tf.nn.softmax_cross_entropy_with_logits(logits=p_logit, labels=y)

# Domain predictor
# The domain classifier reads the features through the gradient reversal
# layer, so gradients of the domain loss reach the feature extractor negated
# and scaled by `l`.
f_ = flip_gradient(F, l)

# NOTE(review): `shallow_domain_classifier` is not defined in this snippet;
# presumably a boolean supplied by the surrounding code - confirm.
if shallow_domain_classifier:
    # Shallow variant: a single linear layer on top of the features.
    W2 = weight_variable([15, 2])
    b2 = bias_variable([2])
    d_logit = tf.matmul(f_, W2) + b2
    d = tf.nn.softmax(d_logit)
    d_loss = tf.nn.softmax_cross_entropy_with_logits(logits=d_logit, labels=D)
else:
    # Deeper variant: one hidden ReLU layer of width 8 before the output layer.
    # Without this `else`, both variants were built and the deeper one
    # silently replaced the shallow one.
    W2 = weight_variable([15, 8])
    b2 = bias_variable([8])
    h2 = tf.nn.relu(tf.matmul(f_, W2) + b2)

    W3 = weight_variable([8, 2])
    b3 = bias_variable([2])
    d_logit = tf.matmul(h2, W3) + b3
    d = tf.nn.softmax(d_logit)
    d_loss = tf.nn.softmax_cross_entropy_with_logits(logits=d_logit, labels=D)

# Optimization
pred_loss = tf.reduce_sum(p_loss, name='pred_loss')
domain_loss = tf.reduce_sum(d_loss, name='domain_loss')
total_loss = tf.add(pred_loss, domain_loss, name='total_loss')

# Three separate training ops: label loss only, domain loss only, and the
# combined loss.
pred_train_op = tf.train.AdamOptimizer().minimize(pred_loss, name='pred_train_op')
domain_train_op = tf.train.AdamOptimizer().minimize(domain_loss, name='domain_train_op')
dann_train_op = tf.train.AdamOptimizer().minimize(total_loss, name='dann_train_op')

# Evaluation
p_acc = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y, 1), tf.argmax(p, 1)), tf.float32), name='p_acc')
d_acc = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(D, 1), tf.argmax(d, 1)), tf.float32), name='d_acc')
I also captured the computation graph from TensorBoard to confirm this:
You can see that the Identity op's gradient function has been replaced by our own "FlipGradient0".

P.S: For more in details

No comments: