import os
import re

import numpy as np
import pandas as pd
import numpy.random as rng
from sklearn.preprocessing import MultiLabelBinarizer
import PySimpleGUI as sg
from keras.layers import Input, Lambda, Dense
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras import backend as K
from keras.optimizers import Adam

# set up the path to the cached processed files with structured data
current_path = os.path.dirname(__file__)
file_source_relative_path = '\\jobs_web_scraper\\jobs_web_scraper\\spiders\\'
scraped_data_path = current_path + file_source_relative_path
output_df = pd.DataFrame()

# walk the file system, find the scraped-data CSV files and merge them into one dataframe
for root, dirs, files in os.walk(scraped_data_path):
    for directory in dirs:
        if re.match('.*scraped_data.*', directory):
            data_dir = scraped_data_path + directory
            for sub_root, sub_dirs, sub_files in os.walk(data_dir):
                for sub_file in sub_files:
                    if re.match('.*complete_scraped_jobs_unfiltered.*', sub_file):
                        to_be_appended = pd.read_csv(data_dir + '\\' + sub_file)
                        output_df = pd.concat([output_df, to_be_appended], ignore_index=True)

# data cleaning
# drop duplicates
output_df.drop_duplicates(inplace=True)

# drop columns that carry no useful signal
output_df.drop(['main_text', 'tags', 'Firemní benefity', 'Jazyky', 'Lokalita', 'Určeno pro',
                'Forma spolupráce'], axis=1, inplace=True)

# merge similar columns into 'skill_req' and delete the originals
cols = ['Klíčové dovednosti', 'Technologie používané na pozici', 'Ostatní dovednosti']
output_df["skill_req"] = output_df[cols].apply(lambda x: '; '.join(x.dropna()), axis=1)
output_df.drop(cols, axis=1, inplace=True)
output_df['skill_req'] = output_df['skill_req'].replace([''], np.nan)

# fill NaN values: a missing pay or experience means 'dohodou' (negotiable)
output_df[['Odměna', 'Požadovaná zkušenost']] = output_df[['Odměna', 'Požadovaná zkušenost']].fillna('dohodou')

# drop the remaining rows with NA values
output_df.dropna(inplace=True)

# keep the urls in a separate series for later lookup
url_df = output_df['url']

# drop identification info which is not useful for ML
output_df.drop(['company', 'url', 'Poslední aktualizace nabídky', 'position'], axis=1, inplace=True)

# data cleaning
# location:
# delete numbers from location names and collapse duplicates, e.g. 'Praha, Praha 1' -> 'Praha'
output_df['location'] = output_df['location'].str.replace('[0-9]+', '', regex=True)
output_df['location'] = output_df['location'].str.replace(';', ' ', regex=True)
output_df['location'] = output_df['location'].str.replace(r'\b(\w+)\s+\1\b', r'\1', regex=True)
output_df['location'] = output_df['location'].str.replace(' ', '; ', regex=True)
output_df['location'] = output_df['location'].str.replace(';$', '', regex=True)

# Odměna (pay):
# delete 'a více' ("and more") and all whitespace
output_df['Odměna'] = output_df['Odměna'].str.replace('a více', '', regex=True)
output_df['Odměna'] = output_df['Odměna'].str.replace(r'\s', '', regex=True)

# normalise all pay values to CZK per month
for ind in output_df.index:
    # check the format
    is_eur = re.match('.*€.*', output_df['Odměna'][ind])
    is_usd = re.match(r'.*\$.*', output_df['Odměna'][ind])
    is_hourly = re.match('.*/hodina.*', output_df['Odměna'][ind])
    is_dohodou = re.match('.*dohodou.*', output_df['Odměna'][ind])

    # find the available numbers and do the normalisation
    if not is_dohodou:
        min_pay = re.search('^[0-9]+', output_df['Odměna'][ind]).group()
        try:
            max_pay = re.search('-[0-9]+', output_df['Odměna'][ind]).group()
        except AttributeError:
            max_pay = None
        if max_pay is not None:
            max_pay = max_pay.replace('-', '')
        if max_pay is None:
            # a single number is treated as the upper bound
            max_pay = min_pay
            min_pay = 0
        min_pay = float(min_pay)
        max_pay = float(max_pay)
        if is_eur:
            min_pay *= 24.5
            max_pay *= 24.5
        if is_usd:
            min_pay *= 23
            max_pay *= 23
        if is_hourly:
            min_pay *= 163.05
            max_pay *= 163.05
        min_pay = round(min_pay, 2)
        max_pay = round(max_pay, 2)
        if min_pay == 0.0 and max_pay == 0.0:
            min_pay = -1
            max_pay = -1
    else:
        # 'dohodou' (negotiable) is encoded as -1
        min_pay = -1
        max_pay = -1
    output_df.loc[ind, 'Odměna'] = str(min_pay) + '; ' + str(max_pay)
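
# Illustrative examples of the normalisation above (hypothetical inputs, not
# taken from the scraped data):
#   '40000-60000Kč/měsíc'  ->  '40000.0; 60000.0'
#   '2000-3000€/měsíc'     ->  '49000.0; 73500.0'  (EUR -> CZK at the 24.5 rate used above)
#   '300Kč/hodina'         ->  '0.0; 48915.0'      (hourly -> monthly via the 163.05 multiplier)
#   'dohodou'              ->  '-1; -1'            (negotiable pay is encoded as -1)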

# list2features
# transform attribute strings with items delimited by ';' into Python lists
cols = list(output_df)
for ind in output_df.index:
    for col in cols:
        value = output_df[col][ind]
        tmp_list = re.findall('[^;]+', value)
        for index in range(len(tmp_list)):
            tmp_list[index] = tmp_list[index].strip()
            tmp_list[index] = tmp_list[index].lower()
        output_df.at[ind, col] = tmp_list

# one-hot encoding and creation of the dictionary backing the input form
dictionary = {}
for col in cols:
    if col != "Odměna":
        mlb = MultiLabelBinarizer()
        output_df = output_df.join(pd.DataFrame(mlb.fit_transform(output_df.pop(col)),
                                                columns=mlb.classes_,
                                                index=output_df.index))
        dictionary[col] = mlb.classes_

# reduce the dataframe shape based on the number of '1' occurrences per column:
# columns with too few occurrences get dropped, as they provide very little information gain
cols = list(output_df)
counter_of_occurrences = []
for col in cols:
    if col != "Odměna":
        for name, cnt in output_df[col].value_counts().items():
            if name == 1:
                counter_of_occurrences.append([col, cnt])
counter_of_occurrences = sorted(counter_of_occurrences, key=lambda x: x[1])
for node in counter_of_occurrences[0:-300]:  # here you can define the dimensionality of the resulting training dataframe
    output_df.drop([node[0]], axis=1, inplace=True)
    # delete the dropped columns from the form dictionary as well
    for key, sub_list in dictionary.items():
        if node[0] in sub_list:
            tmp_sub_list = np.delete(sub_list, np.where(sub_list == node[0]), axis=0)
            dictionary[key] = tmp_sub_list

# split 'Odměna' into separate min/max pay columns
output_df['min_pay'] = ''
output_df['max_pay'] = ''
for idx in output_df.index:
    interval = output_df.loc[idx, 'Odměna']
    output_df.loc[idx, 'min_pay'] = interval[0]
    output_df.loc[idx, 'max_pay'] = interval[1]
output_df.drop(['Odměna'], axis=1, inplace=True)

# convert the pay columns to integers
for i in output_df.columns:
    try:
        output_df[[i]] = output_df[[i]].astype(float).astype(int)
    except (ValueError, TypeError):
        pass

# save the encoded data to a tmp file
# print('empty rows: ', sum)
print('dimensionality: ', output_df.shape[1])
output_df.to_csv("output_df_encoded.csv", index=False)
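
# A minimal sketch of what the MultiLabelBinarizer step above does (hypothetical
# values, run separately from this pipeline):
#
#   from sklearn.preprocessing import MultiLabelBinarizer
#   mlb = MultiLabelBinarizer()
#   mlb.fit_transform([['python', 'sql'], ['sql']])
#   # -> array([[1, 1],
#   #           [0, 1]])  with mlb.classes_ == array(['python', 'sql'])
#
# Each distinct list item becomes one indicator column, which is what makes the
# per-column counts of 1s above a usable proxy for information gain.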

# create the GUI
layout = [
    [sg.Text("Please enter your preferences")],
    [sg.Frame(title="Location", layout=[[sg.Combo(list(dictionary['location']), key='loc', readonly=True,
                                                  enable_events=True, default_value='(none)')]])],
    [sg.Frame(title="Working hours", layout=[[sg.Combo(list(dictionary['Úvazek']), key='uvaz', readonly=True,
                                                       enable_events=True, default_value='(none)')]])],
    [sg.Frame(title="Experience", layout=[[sg.Combo(list(dictionary['Požadovaná zkušenost']), key='exp',
                                                    readonly=True, enable_events=True, default_value='(none)')]])],
    [sg.Frame(title="Key skills", layout=[
        [sg.Combo(list(dictionary['skill_req']), key='skill1', readonly=True, enable_events=True, default_value='(none)')],
        [sg.Combo(list(dictionary['skill_req']), key='skill2', readonly=True, enable_events=True, default_value='(none)')],
        [sg.Combo(list(dictionary['skill_req']), key='skill3', readonly=True, enable_events=True, default_value='(none)')]])],
    [sg.Frame(title="Pay", layout=[
        [sg.Text("minimum value")],
        [sg.Input(default_text='dohodou', key='min', enable_events=True)],
        [sg.Text("maximum value")],
        [sg.Input(default_text='dohodou', key='max', enable_events=True)]])],
    [sg.Button("OK", enable_events=True)]
    # ,[sg.Output(size=(45,5))]
]

# create the window
window = sg.Window("Input form", layout)

# event loop
while True:
    event, values = window.read()
    user_input_df = pd.DataFrame(0, index=np.arange(1), columns=output_df.columns)

    # evaluate the input if the user presses OK
    if event == "OK":
        location = values['loc']
        work_hrs = values['uvaz']
        experience = values['exp']
        skill1 = values['skill1']
        skill2 = values['skill2']
        skill3 = values['skill3']
        wage_min = values['min']
        wage_max = values['max']
        if location == '(none)' or work_hrs == '(none)' or experience == '(none)' or skill1 == '(none)' \
                or skill2 == '(none)' or skill3 == '(none)':
            sg.popup('Please fill in all the fields')
            continue
        if skill1 == skill2 or skill2 == skill3 or skill1 == skill3:
            sg.popup('Please select three different key skills')
            continue
        if (wage_max == 'dohodou' and wage_min != 'dohodou') or (wage_max != 'dohodou' and wage_min == 'dohodou'):
            sg.popup("Pay entered in a wrong format: enter either 'dohodou' in both fields, or a number in both fields")
            continue
        if not ((wage_max.isnumeric() and wage_min.isnumeric())
                or (wage_min == 'dohodou' and wage_max == 'dohodou')):
            sg.popup("Pay entered in a wrong format: enter either 'dohodou' in both fields, or a number in both fields")
            continue
        if wage_min != 'dohodou' and wage_max != 'dohodou':
            wage_min = float(wage_min)
            wage_max = float(wage_max)
        else:
            wage_max = -1
            wage_min = -1
        if wage_min > wage_max:
            sg.popup('Pay entered in a wrong format: the minimum value must not exceed the maximum value')
            continue
        user_input_df.loc[0, location] = 1
        user_input_df.loc[0, work_hrs] = 1
        user_input_df.loc[0, experience] = 1
        user_input_df.loc[0, skill1] = 1
        user_input_df.loc[0, skill2] = 1
        user_input_df.loc[0, skill3] = 1
        user_input_df.loc[0, 'min_pay'] = wage_min
        user_input_df.loc[0, 'max_pay'] = wage_max
        user_input_df.to_csv("user_input_df_encoded.csv", index=False)
        break

    # end the program if the user closes the window
    if event == sg.WIN_CLOSED:
        break

window.close()
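
# The three generators below build artificial user vectors for a given offer
# (reference) vector: generate_exact_match copies the offer's attributes
# (label 1), generate_approx_match perturbs the experience, one skill and the
# pay interval (label 1), and generate_no_match randomises every category
# (label 0). The pay fields are randomised in all three cases.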

def generate_exact_match(reference_vector):
    # prepare an empty user vector
    artif_user_vector = pd.DataFrame(0, index=np.arange(1), columns=reference_vector.columns)

    # flags that limit how many of the 1s actually get transferred to the user vector
    location_chosen = False
    uvazek_chosen = False
    zkusenost_chosen = False
    skill_req_counter = 0

    # assemble the user vector
    for column in reference_vector.columns:
        if column in dictionary['location'] and not location_chosen and reference_vector[column][0] == 1:
            location_chosen = True
            artif_user_vector.loc[0, column] = 1
        if column in dictionary['Úvazek'] and not uvazek_chosen and reference_vector[column][0] == 1:
            uvazek_chosen = True
            artif_user_vector.loc[0, column] = 1
        if column in dictionary['Požadovaná zkušenost'] and not zkusenost_chosen and reference_vector[column][0] == 1:
            zkusenost_chosen = True
            artif_user_vector.loc[0, column] = 1
        if column in dictionary['skill_req'] and skill_req_counter < 3 and reference_vector[column][0] == 1:
            skill_req_counter += 1
            artif_user_vector.loc[0, column] = 1

    # add a bit of randomness to the pay values
    # get the original values
    minimum_pay = reference_vector['min_pay'][0]
    maximum_pay = reference_vector['max_pay'][0]
    # if the pay is 'dohodou':
    if minimum_pay == -1 and maximum_pay == -1:
        coin_toss = rng.randint(0, 2)
        # keep 'dohodou'
        if coin_toss == 0:
            artif_user_vector.loc[0, 'min_pay'] = -1
            artif_user_vector.loc[0, 'max_pay'] = -1
        # or generate a random interval
        else:
            tmp_min = rng.randint(0, 200000)
            tmp_max = rng.randint(0, 200000)
            if tmp_max < tmp_min:
                tmp_min, tmp_max = tmp_max, tmp_min
            artif_user_vector.loc[0, 'min_pay'] = tmp_min
            artif_user_vector.loc[0, 'max_pay'] = tmp_max
    # otherwise generate values within the given interval
    else:
        interval_midpoint = (maximum_pay + minimum_pay) // 2
        artif_user_vector.loc[0, 'min_pay'] = rng.randint(minimum_pay, interval_midpoint)
        artif_user_vector.loc[0, 'max_pay'] = rng.randint(interval_midpoint, maximum_pay)
    return artif_user_vector.iloc[0].values


def generate_approx_match(reference_vector):
    # prepare an empty user vector
    artif_user_vector = pd.DataFrame(0, index=np.arange(1), columns=reference_vector.columns)

    # flags that limit how many of the 1s actually get transferred to the user vector
    location_chosen = False
    uvazek_chosen = False
    zkusenost_chosen = False
    skill_req_counter = 0

    # assemble the user vector
    for column in reference_vector.columns:
        if column in dictionary['location'] and not location_chosen and reference_vector[column][0] == 1:
            location_chosen = True
            artif_user_vector.loc[0, column] = 1
        if column in dictionary['Úvazek'] and not uvazek_chosen and reference_vector[column][0] == 1:
            uvazek_chosen = True
            artif_user_vector.loc[0, column] = 1
        # randomise 'Požadovaná zkušenost' (required experience): it does not
        # have to match, but only when the offer says 'dohodou'
        if column in dictionary['Požadovaná zkušenost'] and not zkusenost_chosen and reference_vector[column][0] == 1:
            zkusenost_chosen = True
            if column == 'dohodou':
                while True:
                    coin_toss = rng.randint(0, len(dictionary['Požadovaná zkušenost']))
                    choice = dictionary['Požadovaná zkušenost'][coin_toss]
                    if choice != 'dohodou':
                        artif_user_vector.loc[0, choice] = 1
                        break
            else:
                artif_user_vector.loc[0, column] = 1
        # one of the three skill requirements does not have to match exactly
        if column in dictionary['skill_req'] and skill_req_counter < 3 and reference_vector[column][0] == 1:
            skill_req_counter += 1
            # randomise the second skill requirement
            if skill_req_counter == 2:
                coin_toss = rng.randint(0, len(dictionary['skill_req']))
                random_skill = dictionary['skill_req'][coin_toss]
                artif_user_vector.loc[0, random_skill] = 1
            # the others should match
            else:
                artif_user_vector.loc[0, column] = 1

    # add a bit of randomness to the pay values
    # get the original values
    minimum_pay = reference_vector['min_pay'][0]
    maximum_pay = reference_vector['max_pay'][0]
    # if the pay is 'dohodou':
    if minimum_pay == -1 and maximum_pay == -1:
        coin_toss = rng.randint(0, 2)
        # keep 'dohodou'
        if coin_toss == 0:
            artif_user_vector.loc[0, 'min_pay'] = -1
            artif_user_vector.loc[0, 'max_pay'] = -1
        # or generate a random interval
        else:
            tmp_min = rng.randint(0, 200000)
            tmp_max = rng.randint(0, 200000)
            if tmp_max < tmp_min:
                tmp_min, tmp_max = tmp_max, tmp_min
            artif_user_vector.loc[0, 'min_pay'] = tmp_min
            artif_user_vector.loc[0, 'max_pay'] = tmp_max
    # otherwise generate values within the given interval
    # (with some overlap allowed to simulate an approximate match)
    else:
        interval_midpoint = (maximum_pay + minimum_pay) // 2
        artif_user_vector.loc[0, 'min_pay'] = rng.randint(0, interval_midpoint)
        artif_user_vector.loc[0, 'max_pay'] = rng.randint(interval_midpoint, maximum_pay + 100000)
    return artif_user_vector.iloc[0].values
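
# Pay sampling sketch (hypothetical offer with min_pay=40000, max_pay=60000,
# so midpoint 50000): generate_exact_match draws min from [40000, 50000) and
# max from [50000, 60000), staying inside the offer's interval, while
# generate_approx_match draws min from [0, 50000) and max from
# [50000, 160000), allowing only partial overlap with the offer.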

def generate_no_match(reference_vector):
    # prepare an empty user vector
    artif_user_vector = pd.DataFrame(0, index=np.arange(1), columns=reference_vector.columns)

    # flags that limit how many of the 1s actually get transferred to the user vector
    location_chosen = False
    uvazek_chosen = False
    zkusenost_chosen = False
    skill_req_counter = 0

    # assemble the user vector: randomise everything
    for column in reference_vector.columns:
        if column in dictionary['location'] and not location_chosen and reference_vector[column][0] == 1:
            location_chosen = True
            coin_toss = rng.randint(0, len(dictionary['location']))
            random_location = dictionary['location'][coin_toss]
            artif_user_vector.loc[0, random_location] = 1
        if column in dictionary['Úvazek'] and not uvazek_chosen and reference_vector[column][0] == 1:
            uvazek_chosen = True
            coin_toss = rng.randint(0, len(dictionary['Úvazek']))
            random_uvazek = dictionary['Úvazek'][coin_toss]
            artif_user_vector.loc[0, random_uvazek] = 1
        if column in dictionary['Požadovaná zkušenost'] and not zkusenost_chosen and reference_vector[column][0] == 1:
            zkusenost_chosen = True
            coin_toss = rng.randint(0, len(dictionary['Požadovaná zkušenost']))
            random_experience = dictionary['Požadovaná zkušenost'][coin_toss]
            artif_user_vector.loc[0, random_experience] = 1
        if column in dictionary['skill_req'] and skill_req_counter < 3 and reference_vector[column][0] == 1:
            skill_req_counter += 1
            coin_toss = rng.randint(0, len(dictionary['skill_req']))
            random_skill = dictionary['skill_req'][coin_toss]
            artif_user_vector.loc[0, random_skill] = 1

    # add a bit of randomness to the pay values
    # get the original values
    minimum_pay = reference_vector['min_pay'][0]
    maximum_pay = reference_vector['max_pay'][0]
    # if the pay is 'dohodou':
    if minimum_pay == -1 and maximum_pay == -1:
        coin_toss = rng.randint(0, 2)
        # keep 'dohodou'
        if coin_toss == 0:
            artif_user_vector.loc[0, 'min_pay'] = -1
            artif_user_vector.loc[0, 'max_pay'] = -1
        # or generate a random interval
        else:
            tmp_min = rng.randint(0, 200000)
            tmp_max = rng.randint(0, 200000)
            if tmp_max < tmp_min:
                tmp_min, tmp_max = tmp_max, tmp_min
            artif_user_vector.loc[0, 'min_pay'] = tmp_min
            artif_user_vector.loc[0, 'max_pay'] = tmp_max
    # otherwise generate values within the given interval
    # (with some overlap allowed, as in the approximate match)
    else:
        interval_midpoint = (maximum_pay + minimum_pay) // 2
        artif_user_vector.loc[0, 'min_pay'] = rng.randint(0, interval_midpoint)
        artif_user_vector.loc[0, 'max_pay'] = rng.randint(interval_midpoint, maximum_pay + 100000)
    return artif_user_vector.iloc[0].values
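
# get_batch assembles a training batch of 4 * sub_batch_size pairs: the first
# half are exact matches labelled 1, the second half no-matches labelled 0;
# the approximate-match block is currently commented out.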

def get_batch(sub_batch_size):
    # get the encoded data
    encoded_data = pd.read_csv('output_df_encoded.csv')
    # get the batch size
    batch_size = 4 * sub_batch_size

    # prepare an empty matrix for the reference (original) and user (artificial) vector pairs
    vector_size = encoded_data.shape[1]
    pair_matrix = [np.zeros((batch_size, vector_size), dtype=int) for i in range(2)]
    # prepare an empty matrix for the expected results
    result_matrix = np.zeros((batch_size,), dtype=int)

    # add exact matches
    for i in range(2 * sub_batch_size):
        # populate the pair matrix with a reference vector
        rng_idx = rng.randint(0, encoded_data.shape[0])
        pair_matrix[0][i] = encoded_data.iloc[rng_idx]  # original vector
        # separate the reference vector to be passed to the generator
        reference_vector = pd.DataFrame(0, index=np.arange(1), columns=encoded_data.columns)
        reference_vector.loc[encoded_data.index[0]] = encoded_data.iloc[rng_idx]
        # create an artificial user vector
        artif_user_vector = generate_exact_match(reference_vector)
        # populate the matrix with the artificial user vector
        pair_matrix[1][i] = artif_user_vector
        result_matrix[i] = 1  # populate the result

    # add approximate matches (currently disabled)
    # for i in range(sub_batch_size, 2 * sub_batch_size):
    #     # populate the pair matrix with a reference vector
    #     rng_idx = rng.randint(0, encoded_data.shape[0])
    #     pair_matrix[0][i] = encoded_data.iloc[rng_idx]  # original vector
    #     # separate the reference vector to be passed to the generator
    #     reference_vector = pd.DataFrame(0, index=np.arange(1), columns=encoded_data.columns)
    #     reference_vector.loc[encoded_data.index[0]] = encoded_data.iloc[rng_idx]
    #     # create an artificial user vector
    #     artif_user_vector = generate_approx_match(reference_vector)
    #     # populate the matrix with the artificial user vector
    #     pair_matrix[1][i] = artif_user_vector
    #     result_matrix[i] = 1

    # add no-matches
    for i in range(2 * sub_batch_size, 4 * sub_batch_size):
        # populate the pair matrix with a reference vector
        rng_idx = rng.randint(0, encoded_data.shape[0])
        pair_matrix[0][i] = encoded_data.iloc[rng_idx]  # original vector
        # separate the reference vector to be passed to the generator
        reference_vector = pd.DataFrame(0, index=np.arange(1), columns=encoded_data.columns)
        reference_vector.loc[encoded_data.index[0]] = encoded_data.iloc[rng_idx]
        # create an artificial user vector
        artif_user_vector = generate_no_match(reference_vector)
        # populate the matrix with the artificial user vector
        pair_matrix[1][i] = artif_user_vector
        result_matrix[i] = 0  # populate the result

    return pair_matrix, result_matrix


def initialize_weights(shape, dtype=None):
    return np.random.normal(loc=0.0, scale=1e-2, size=shape)


def initialize_bias(shape, dtype=None):
    return np.random.normal(loc=0.5, scale=1e-2, size=shape)


def get_siamese_model(input_shape):
    # define the tensors for the two input vectors
    left_input = Input(input_shape)
    right_input = Input(input_shape)

    # initialise the shared network
    model = Sequential()
    # dense hidden layer
    hidden_l_perceptron_no = round(input_shape[0] * (2 / 3))
    model.add(Dense(hidden_l_perceptron_no, activation='relu',
                    kernel_initializer=initialize_weights, bias_initializer=initialize_bias,
                    kernel_regularizer=l2(2e-4)))

    # generate the encodings for the two vectors
    encoded_l = model(left_input)
    encoded_r = model(right_input)

    # add a customised layer to compute the absolute difference between the encodings
    L1_layer = Lambda(lambda tensors: K.abs(tensors[0] - tensors[1]))
    L1_distance = L1_layer([encoded_l, encoded_r])

    # add a dense layer with a sigmoid unit to generate the similarity score
    prediction = Dense(1, activation='sigmoid', bias_initializer=initialize_bias)(L1_distance)

    # connect the inputs with the outputs
    siamese_net = Model(inputs=[left_input, right_input], outputs=prediction)

    return siamese_net
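
# Usage sketch for the siamese net (illustrative; assumes 300-dimensional
# encoded vectors, the real dimensionality comes from output_df.shape[1]):
#   net = get_siamese_model((300,))
#   score = net.predict([np.zeros((1, 300)), np.zeros((1, 300))])
# Identical inputs produce identical encodings, so the L1 distance is a zero
# vector and the untrained output is sigmoid(bias) ~ 0.62, since the output
# bias is initialised around 0.5.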

def make_oneshot_task(sub_n_way):
    """Create pairs of an offer vector and a support set of dummy user vectors
    for testing N-way one-shot learning."""
    encoded_data = pd.read_csv('output_df_encoded.csv')

    # obtain a random original offer vector
    rng_idx = rng.randint(0, encoded_data.shape[0])
    offer_vector = encoded_data.iloc[rng_idx].values  # original offer vector
    # separate the reference vector to be passed to the generators
    reference_vector = pd.DataFrame(0, index=np.arange(1), columns=encoded_data.columns)
    reference_vector.loc[encoded_data.index[0]] = offer_vector

    # generate artificial user vectors
    support_set = []
    targets = []
    multiplied_offer_vector = []
    # exact match
    artif_user_vector = generate_exact_match(reference_vector)
    support_set.append(artif_user_vector)
    targets.append(1)
    multiplied_offer_vector.append(offer_vector)
    # approximate matches
    for i in range(2 * sub_n_way - 1):
        artif_user_vector = generate_approx_match(reference_vector)
        support_set.append(artif_user_vector)
        targets.append(1)
        multiplied_offer_vector.append(offer_vector)
    # no-matches
    for i in range(2 * sub_n_way):
        artif_user_vector = generate_no_match(reference_vector)
        support_set.append(artif_user_vector)
        targets.append(0)
        multiplied_offer_vector.append(offer_vector)

    pairs = [np.asarray(multiplied_offer_vector), np.asarray(support_set)]
    return pairs, np.asarray(targets)


def test_oneshot(model, sub_n_way, k_iter):
    """Test the average N-way one-shot learning accuracy of a siamese neural net over k one-shot tasks."""
    # each task contains 1 exact + (2N - 1) approximate + 2N no-match pairs = 4N pairs
    print("Evaluating model on {} random {} way one-shot learning tasks\n".format(k_iter, 4 * sub_n_way))
    true_positive = 0
    false_positive = 0
    true_negative = 0
    false_negative = 0
    sum_outputs = 0
    for i in range(k_iter):
        inputs, targets = make_oneshot_task(sub_n_way)
        sum_outputs = len(inputs[0])
        probs = model.predict(inputs, verbose=0)
        for pos in range(len(probs)):
            # binarise the similarity score with a 0.65 threshold
            if probs[pos][0] >= 0.65:
                probs[pos][0] = 1
            else:
                probs[pos][0] = 0
            if targets[pos] == 1 and probs[pos][0] == 1:
                true_positive += 1
            if targets[pos] == 1 and probs[pos][0] == 0:
                false_negative += 1
            if targets[pos] == 0 and probs[pos][0] == 1:
                false_positive += 1
            if targets[pos] == 0 and probs[pos][0] == 0:
                true_negative += 1

    total_predictions = k_iter * sum_outputs
    accuracy = (true_positive + true_negative) / total_predictions
    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)
    f_measure = 2 * (precision * recall) / (precision + recall)
    accuracy = round(accuracy * 100, 0)
    precision = round(precision * 100, 0)
    recall = round(recall * 100, 0)
    f_measure = round(f_measure * 100, 0)
    print(f"For {k_iter} iterations of {4 * sub_n_way} way one-shot tasks you got:\n"
          f"accuracy: {accuracy}% (correct predictions / total number of predictions)\n"
          f"precision: {precision}% (what proportion of positive identifications was actually correct?)\n"
          f"recall: {recall}% (what proportion of actual positives was identified correctly?)\n"
          f"f-measure: {f_measure}% (combined metric, the higher the better)\n")
    return accuracy


work_model = get_siamese_model((output_df.shape[1],))
work_model.summary()
work_model.load_weights("./weights/weights.400.h5")
optimizer = Adam(learning_rate=0.001)
work_model.compile(loss="binary_crossentropy", optimizer=optimizer)

# hyperparameters
evaluate_every = 50  # interval for evaluating on one-shot tasks
sub_batch_size = 10  # training batch size, will be * 4 (half exact matches, half no-matches)
n_iter = 500  # total number of training iterations
sub_N_way = 10  # testing batch size, will be 1 exact, (2*N) - 1 approx, 2*N no match
k_iter = 10  # how many one-shot tasks to validate on per testing phase
best = -1

# run the training cycle (requires `import time` if re-enabled)
# model_path = './weights/'
# print("Starting training process!")
# print("-------------------------------------")
# t_start = time.time()
# for i in range(1, n_iter + 1):
#     (inputs, targets) = get_batch(sub_batch_size)
#     loss = work_model.train_on_batch(inputs, targets)
#     if i % evaluate_every == 0:
#         print("\n ------------- \n")
#         print("Time for {0} iterations: {1} mins".format(i, (time.time() - t_start) / 60.0))
#         print("Train Loss: {0}".format(loss))
#         val_acc = test_oneshot(work_model, sub_N_way, k_iter)
#         work_model.save_weights(os.path.join(model_path, 'weights.{}.h5'.format(i)))
#         if val_acc >= best:
#             print("Current best: {0}, previous best: {1}".format(val_acc, best))
#             best = val_acc
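
# Note: the loop below scores each offer against the user vector one pair at a
# time; with a larger dataset a single batched predict call over all offers
# (e.g. tiling the user vector with np.tile) would likely be faster, but the
# one-at-a-time version keeps the printout logic simple.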

# run the user testing cycle
print("starting to predict:")
encoded_data = pd.read_csv('output_df_encoded.csv')
for idx in range(encoded_data.shape[0]):
    offer = encoded_data.iloc[idx].values
    user = user_input_df.iloc[0].values
    pair_to_predict = [np.asarray([offer]), np.asarray([user])]
    probs = work_model.predict(pair_to_predict, verbose=0)
    if probs[0][0] >= 0.9:
        print("-----------------------------------")
        print(probs)
        print(url_df.iloc[idx])
        # print the active attributes of the matched offer...
        offer_vector = pd.DataFrame(0, index=np.arange(1), columns=encoded_data.columns)
        offer_vector.loc[encoded_data.index[0]] = offer
        for column in offer_vector.columns:
            if offer_vector[column].iloc[0] == 1:
                print(column, end=', ')
        print()
        # ...and the active attributes of the user's preferences
        user_vector = pd.DataFrame(0, index=np.arange(1), columns=encoded_data.columns)
        user_vector.loc[encoded_data.index[0]] = user
        for column in user_vector.columns:
            if user_vector[column].iloc[0] == 1:
                print(column, end=', ')
        print()
        print("-----------------------------------")