# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: skip-file
import numpy as np
import mxnet as mx
import numba
import logging

# We use numba.jit to implement the loss gradient.  Only the NumPy loop is
# compiled: the incoming score/label objects are converted with .asnumpy() in
# plain Python first, because numba cannot type arbitrary Python objects and
# recent numba releases no longer fall back to object mode for a bare @jit.
@numba.jit(nopython=True)
def _mc_hinge_grad_kernel(scores, labels):
    """Compute the multi-class hinge gradient on plain NumPy arrays.

    Parameters
    ----------
    scores : 2-D numpy array, one row of class scores per sample.
    labels : 1-D integer numpy array, the true class index of each sample.

    Returns
    -------
    numpy array with the same shape and dtype as `scores`.
    """
    n = scores.shape[0]
    grad = np.zeros_like(scores)

    for i in range(n):
        label = labels[i]
        # Margin of every class against the true class; the true class itself
        # is pinned to 0 so it wins the argmax only when no other class has a
        # positive margin.
        margins = 1 + scores[i] - scores[i, label]
        margins[label] = 0
        ind_pred = margins.argmax()
        # -1 on the true class, +1 on the most violating class.  When both
        # indices coincide the two updates cancel and the row stays zero.
        grad[i, label] -= 1
        grad[i, ind_pred] += 1

    return grad


def mc_hinge_grad(scores, labels):
    """Return the gradient of the multi-class hinge loss w.r.t. `scores`.

    Parameters
    ----------
    scores : object exposing `.asnumpy()` that yields a 2-D array with one
        row of class scores per sample.
    labels : object exposing `.asnumpy()` that yields one class index per
        sample; the values are cast to int before being used as indices.

    Returns
    -------
    numpy array with the same shape and dtype as the converted `scores`.
    """
    scores_np = scores.asnumpy()
    labels_np = labels.asnumpy().astype(int)
    return _mc_hinge_grad_kernel(scores_np, labels_np)

if __name__ == '__main__':
    # Training configuration.
    num_gpu = 2
    batch_size = 100
    n_epoch = 10

    # A single CPU context when no GPU is requested, otherwise one context
    # per GPU index in range(num_gpu).
    if num_gpu < 1:
        contexts = mx.context.cpu()
    else:
        contexts = [mx.context.gpu(dev_id) for dev_id in range(num_gpu)]

    # Three fully connected layers (128 -> 64 -> 10 hidden units) with ReLU
    # activations after the first two.
    net = mx.symbol.Variable('data')
    net = mx.symbol.FullyConnected(net, name='fc1', num_hidden=128)
    net = mx.symbol.Activation(net, name='relu1', act_type="relu")
    net = mx.symbol.FullyConnected(net, name='fc2', num_hidden=64)
    net = mx.symbol.Activation(net, name='relu2', act_type="relu")
    net = mx.symbol.FullyConnected(net, name='fc3', num_hidden=10)

    # Chain the network module with a Python loss module whose gradient is
    # supplied by mc_hinge_grad.
    mlp = mx.mod.Module(net, context=contexts)
    loss = mx.mod.PythonLossModule(grad_func=mc_hinge_grad)

    mod = mx.mod.SequentialModule()
    mod.add(mlp)
    mod.add(loss, take_labels=True, auto_wiring=True)

    # Settings shared by the training and validation iterators.
    iter_kwargs = dict(
        data_shape=(784,),
        batch_size=batch_size,
        shuffle=True,
        flat=True,
        silent=False,
    )
    train_dataiter = mx.io.MNISTIter(
        image="data/train-images-idx3-ubyte",
        label="data/train-labels-idx1-ubyte",
        seed=10,
        **iter_kwargs)
    val_dataiter = mx.io.MNISTIter(
        image="data/t10k-images-idx3-ubyte",
        label="data/t10k-labels-idx1-ubyte",
        **iter_kwargs)

    logging.basicConfig(level=logging.DEBUG)

    optimizer_params = {'learning_rate': 0.01, 'momentum': 0.9}
    mod.fit(
        train_dataiter,
        eval_data=val_dataiter,
        optimizer_params=optimizer_params,
        num_epoch=n_epoch,
    )