# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import ctypes

from mxnet.test_utils import *
import os
import time
import argparse

from mxnet.base import check_call, _LIB

# Command-line interface of the benchmark script. The parsed options are kept
# in the module-level ``args`` so the benchmark code below can read them.
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    description="Benchmark cast storage operators",
)
parser.add_argument(
    '--num-omp-threads',
    default=1,
    type=int,
    help='number of omp threads to set in MXNet',
)
args = parser.parse_args()

def measure_cost(repeat, f, *args, **kwargs):
    """Return the average elapsed time, in seconds, of one call to ``f``.

    ``f(*args, **kwargs)`` is invoked ``repeat`` times in a row and
    ``wait_to_read()`` is called on every returned object before the next
    iteration starts, so the time spent in that call is part of the
    measurement.

    Parameters
    ----------
    repeat : int
        Number of timed invocations to average over; must be at least 1.
    f : callable
        Function under test. Its return value must provide a
        ``wait_to_read()`` method.
    *args, **kwargs
        Forwarded unchanged to ``f`` on every invocation.

    Returns
    -------
    float
        Total elapsed time divided by ``repeat``.

    Raises
    ------
    ValueError
        If ``repeat`` is smaller than 1 (an average over zero runs is
        undefined).
    """
    if repeat < 1:
        raise ValueError("repeat must be a positive integer, got %r" % (repeat,))

    # Prefer the monotonic high-resolution clock when the interpreter has one;
    # time.time() can jump if the system clock is adjusted mid-benchmark.
    timer = getattr(time, 'perf_counter', time.time)

    start = timer()
    for _ in range(repeat):
        f(*args, **kwargs).wait_to_read()
    return (timer() - start) / repeat


def run_cast_storage_synthetic():
    """Benchmark ``mx.nd.cast_storage`` from dense to sparse on synthetic data.

    For every benchmark in ``benchmarks``, every ``(m, n)`` shape, every
    context and every density configured below, a random matrix is generated,
    cast once as a warm-up/correctness check, and then cast ``num_repeat``
    times while being timed. One formatted result line is printed per
    configuration.

    The number of OMP threads is set from the ``--num-omp-threads``
    command-line option before any measurement is taken.
    """
    def dense_to_sparse(m, n, density, ctx, repeat, stype):
        """Time casting one dense ``m`` x ``n`` matrix to ``stype`` and print the result.

        Parameters
        ----------
        m, n : int
            Number of rows and columns of the matrix.
        density : float
            Density passed to ``rand_ndarray`` when generating the data.
        ctx : Context
            Context made the default before the data is created.
        repeat : int
            Number of timed runs to average over.
        stype : str
            Target storage type of the cast ('csr' or 'row_sparse').

        Raises
        ------
        AssertionError
            If the warm-up cast does not reproduce the dense input.
        """
        set_default_context(ctx)
        data_shape = (m, n)
        # Generate the data in the target storage type with the requested
        # density, then convert it to dense storage as the benchmark input.
        dns_data = rand_ndarray(data_shape, stype, density).tostype('default')
        dns_data.wait_to_read()

        # do one warm up run, verify correctness
        # (explicit raise instead of `assert`, so that `python -O` removes
        # neither the warm-up call nor the check)
        warm_up = mx.nd.cast_storage(dns_data, stype)
        if not same(warm_up.asnumpy(), dns_data.asnumpy()):
            raise AssertionError(
                "cast_storage to %s does not reproduce the dense input" % stype)

        # start benchmarking
        cost = measure_cost(repeat, mx.nd.cast_storage, dns_data, stype)
        results = '{:10.1f} {:>10} {:8d} {:8d} {:10.2f}'.format(density*100, str(ctx), m, n, cost*1000)
        print(results)

    check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads)))

    # params
    # m           number of rows
    # n           number of columns
    # density     density of the matrix
    # num_repeat  number of benchmark runs to average over
    # contexts    mx.cpu(), mx.gpu()
    #             note: benchmark different contexts separately; to benchmark cpu, compile without CUDA
    # benchmarks  dns_to_csr, dns_to_rsp
    m = [  512,    512]
    n = [50000, 100000]
    density = [1.00, 0.80, 0.60, 0.40, 0.20, 0.10, 0.05, 0.02, 0.01]
    num_repeat = 10
    contexts = [mx.gpu()]
    benchmarks = ["dns_to_csr", "dns_to_rsp"]

    # benchmark name -> (target storage type, banner title)
    benchmark_info = {
        "dns_to_csr": ('csr', " cast_storage benchmark: dense to csr, size m x n "),
        "dns_to_rsp": ('row_sparse', " cast_storage benchmark: dense to rsp, size m x n "),
    }
    separator = "=================================================="
    headline = '{:>10} {:>10} {:>8} {:>8} {:>10}'.format('density(%)', 'context', 'm', 'n', 'time(ms)')

    # run benchmark
    for b in benchmarks:
        print(separator)
        # Look the name up by value; the previous `b is "..."` identity test
        # only worked because CPython happens to intern string literals.
        if b not in benchmark_info:
            print("invalid benchmark: %s" %b)
            continue
        stype, title = benchmark_info[b]
        print(title)
        print(separator)
        print(headline)
        # m and n are parallel lists: entry i of each describes one shape.
        for num_rows, num_cols in zip(m, n):
            for ctx in contexts:
                for den in density:
                    dense_to_sparse(num_rows, num_cols, den, ctx, num_repeat, stype)
            print("")
        print("")


# Run the synthetic cast_storage benchmark only when this file is executed as
# a script, not when it is imported.
if __name__ == "__main__":
    run_cast_storage_synthetic()