import os
import re

import numpy as np
import pandas as pd
import numpy.random as rng
from sklearn.preprocessing import MultiLabelBinarizer
import PySimpleGUI as sg
from keras.layers import Input, Lambda, Dense
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras import backend as K
from keras.optimizers import Adam

# set up the path to the cached processed files with structured data
current_path = os.path.dirname(__file__)
file_source_relative_path = '\\jobs_web_scraper\\jobs_web_scraper\\spiders\\'
scraped_data_path = current_path + file_source_relative_path
output_df = pd.DataFrame()

# walk the file system, find the scraped-data CSV files and merge them into one dataframe
for root, dirs, files in os.walk(scraped_data_path):
    for directory in dirs:
        if re.match('.*scraped_data.*', directory):
            data_dir = scraped_data_path + directory
            for sub_root, sub_dirs, sub_files in os.walk(data_dir):
                for sub_file in sub_files:
                    if re.match('.*complete_scraped_jobs_unfiltered.*', sub_file):
                        to_be_appended = pd.read_csv(data_dir + '\\' + sub_file)
                        output_df = pd.concat([output_df, to_be_appended], ignore_index=True)

# data cleaning
# drop duplicates
output_df.drop_duplicates(inplace=True)

# drop columns that carry no useful signal
output_df.drop(['main_text', 'tags', 'Firemní benefity', 'Jazyky', 'Lokalita', 'Určeno pro',
                'Forma spolupráce'], axis=1, inplace=True)

# merge similar columns into 'skill_req' and delete the originals
cols = ['Klíčové dovednosti', 'Technologie používané na pozici', 'Ostatní dovednosti']
output_df["skill_req"] = output_df[cols].apply(lambda x: '; '.join(x.dropna()), axis=1)
output_df.drop(cols, axis=1, inplace=True)
output_df['skill_req'] = output_df['skill_req'].replace([''], np.nan)

# fill NaN values: a missing pay or experience means 'dohodou' (negotiable)
output_df[['Odměna', 'Požadovaná zkušenost']] = output_df[['Odměna', 'Požadovaná zkušenost']].fillna('dohodou')

# drop the remaining rows with NA values
output_df.dropna(inplace=True)

# keep the urls in a separate series for later lookup
url_df = output_df['url']

# drop identification info which is not useful for ML
output_df.drop(['company', 'url', 'Poslední aktualizace nabídky', 'position'], axis=1, inplace=True)

# data cleaning
# location:
# delete numbers from location names and collapse duplicates, e.g. 'Praha, Praha 1' -> 'Praha'
output_df['location'] = output_df['location'].str.replace('[0-9]+', '', regex=True)
output_df['location'] = output_df['location'].str.replace(';', ' ', regex=True)
output_df['location'] = output_df['location'].str.replace(r'\b(\w+)\s+\1\b', r'\1', regex=True)
output_df['location'] = output_df['location'].str.replace(' ', '; ', regex=True)
output_df['location'] = output_df['location'].str.replace(';$', '', regex=True)

# Odměna (pay):
# delete 'a více' ("and more") and all whitespace
output_df['Odměna'] = output_df['Odměna'].str.replace('a více', '', regex=True)
output_df['Odměna'] = output_df['Odměna'].str.replace(r'\s', '', regex=True)

# normalise all pay values to CZK per month
for ind in output_df.index:
    # check the format
    is_eur = re.match('.*€.*', output_df['Odměna'][ind])
    is_usd = re.match(r'.*\$.*', output_df['Odměna'][ind])
    is_hourly = re.match('.*/hodina.*', output_df['Odměna'][ind])
    is_dohodou = re.match('.*dohodou.*', output_df['Odměna'][ind])

    # find the available numbers and do the normalisation
    if not is_dohodou:
        min_pay = re.search('^[0-9]+', output_df['Odměna'][ind]).group()
        try:
            max_pay = re.search('-[0-9]+', output_df['Odměna'][ind]).group()
        except AttributeError:
            max_pay = None
        if max_pay is not None:
            max_pay = max_pay.replace('-', '')
        if max_pay is None:
            # a single number is treated as the upper bound
            max_pay = min_pay
            min_pay = 0
        min_pay = float(min_pay)
        max_pay = float(max_pay)
        if is_eur:
            min_pay *= 24.5
            max_pay *= 24.5
        if is_usd:
            min_pay *= 23
            max_pay *= 23
        if is_hourly:
            min_pay *= 163.05
            max_pay *= 163.05
        min_pay = round(min_pay, 2)
        max_pay = round(max_pay, 2)
        if min_pay == 0.0 and max_pay == 0.0:
            min_pay = -1
            max_pay = -1
    else:
        # 'dohodou' (negotiable) is encoded as -1
        min_pay = -1
        max_pay = -1
    output_df.loc[ind, 'Odměna'] = str(min_pay) + '; ' + str(max_pay)
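
# Illustrative examples of the normalisation above (hypothetical inputs, not
# taken from the scraped data):
#   '40000-60000Kč/měsíc'  ->  '40000.0; 60000.0'
#   '2000-3000€/měsíc'     ->  '49000.0; 73500.0'  (EUR -> CZK at the 24.5 rate used above)
#   '300Kč/hodina'         ->  '0.0; 48915.0'      (hourly -> monthly via the 163.05 multiplier)
#   'dohodou'              ->  '-1; -1'            (negotiable pay is encoded as -1)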

# list2features
# transform attribute strings with items delimited by ';' into Python lists
cols = list(output_df)
for ind in output_df.index:
    for col in cols:
        value = output_df[col][ind]
        tmp_list = re.findall('[^;]+', value)
        for index in range(len(tmp_list)):
            tmp_list[index] = tmp_list[index].strip()
            tmp_list[index] = tmp_list[index].lower()
        output_df.at[ind, col] = tmp_list

# one-hot encoding and creation of the dictionary backing the input form
dictionary = {}
for col in cols:
    if col != "Odměna":
        mlb = MultiLabelBinarizer()
        output_df = output_df.join(pd.DataFrame(mlb.fit_transform(output_df.pop(col)),
                                                columns=mlb.classes_,
                                                index=output_df.index))
        dictionary[col] = mlb.classes_

# reduce the dataframe shape based on the number of '1' occurrences per column:
# columns with too few occurrences get dropped, as they provide very little information gain
cols = list(output_df)
counter_of_occurrences = []
for col in cols:
    if col != "Odměna":
        for name, cnt in output_df[col].value_counts().items():
            if name == 1:
                counter_of_occurrences.append([col, cnt])
counter_of_occurrences = sorted(counter_of_occurrences, key=lambda x: x[1])
for node in counter_of_occurrences[0:-300]:  # here you can define the dimensionality of the resulting training dataframe
    output_df.drop([node[0]], axis=1, inplace=True)
    # delete the dropped columns from the form dictionary as well
    for key, sub_list in dictionary.items():
        if node[0] in sub_list:
            tmp_sub_list = np.delete(sub_list, np.where(sub_list == node[0]), axis=0)
            dictionary[key] = tmp_sub_list

# split 'Odměna' into separate min/max pay columns
output_df['min_pay'] = ''
output_df['max_pay'] = ''
for idx in output_df.index:
    interval = output_df.loc[idx, 'Odměna']
    output_df.loc[idx, 'min_pay'] = interval[0]
    output_df.loc[idx, 'max_pay'] = interval[1]
output_df.drop(['Odměna'], axis=1, inplace=True)

# convert the pay columns to integers
for i in output_df.columns:
    try:
        output_df[[i]] = output_df[[i]].astype(float).astype(int)
    except (ValueError, TypeError):
        pass

# save the encoded data to a tmp file
# print('empty rows: ', sum)
print('dimensionality: ', output_df.shape[1])
output_df.to_csv("output_df_encoded.csv", index=False)
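
# A minimal sketch of what the MultiLabelBinarizer step above does (hypothetical
# values, run separately from this pipeline):
#
#   from sklearn.preprocessing import MultiLabelBinarizer
#   mlb = MultiLabelBinarizer()
#   mlb.fit_transform([['python', 'sql'], ['sql']])
#   # -> array([[1, 1],
#   #           [0, 1]])  with mlb.classes_ == array(['python', 'sql'])
#
# Each distinct list item becomes one indicator column, which is what makes the
# per-column counts of 1s above a usable proxy for information gain.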

# create the GUI
layout = [
    [sg.Text("Please enter your preferences")],
    [sg.Frame(title="Location", layout=[[sg.Combo(list(dictionary['location']), key='loc', readonly=True,
                                                  enable_events=True, default_value='(none)')]])],
    [sg.Frame(title="Working hours", layout=[[sg.Combo(list(dictionary['Úvazek']), key='uvaz', readonly=True,
                                                       enable_events=True, default_value='(none)')]])],
    [sg.Frame(title="Experience", layout=[[sg.Combo(list(dictionary['Požadovaná zkušenost']), key='exp',
                                                    readonly=True, enable_events=True, default_value='(none)')]])],
    [sg.Frame(title="Key skills", layout=[
        [sg.Combo(list(dictionary['skill_req']), key='skill1', readonly=True, enable_events=True, default_value='(none)')],
        [sg.Combo(list(dictionary['skill_req']), key='skill2', readonly=True, enable_events=True, default_value='(none)')],
        [sg.Combo(list(dictionary['skill_req']), key='skill3', readonly=True, enable_events=True, default_value='(none)')]])],
    [sg.Frame(title="Pay", layout=[
        [sg.Text("minimum value")],
        [sg.Input(default_text='dohodou', key='min', enable_events=True)],
        [sg.Text("maximum value")],
        [sg.Input(default_text='dohodou', key='max', enable_events=True)]])],
    [sg.Button("OK", enable_events=True)]
    # ,[sg.Output(size=(45,5))]
]

# create the window
window = sg.Window("Input form", layout)

# event loop
while True:
    event, values = window.read()
    user_input_df = pd.DataFrame(0, index=np.arange(1), columns=output_df.columns)

    # evaluate the input if the user presses OK
    if event == "OK":
        location = values['loc']
        work_hrs = values['uvaz']
        experience = values['exp']
        skill1 = values['skill1']
        skill2 = values['skill2']
        skill3 = values['skill3']
        wage_min = values['min']
        wage_max = values['max']
        if location == '(none)' or work_hrs == '(none)' or experience == '(none)' or skill1 == '(none)' \
                or skill2 == '(none)' or skill3 == '(none)':
            sg.popup('Please fill in all the fields')
            continue
        if skill1 == skill2 or skill2 == skill3 or skill1 == skill3:
            sg.popup('Please select three different key skills')
            continue
        if (wage_max == 'dohodou' and wage_min != 'dohodou') or (wage_max != 'dohodou' and wage_min == 'dohodou'):
            sg.popup("Pay entered in a wrong format: enter either 'dohodou' in both fields, or a number in both fields")
            continue
        if not ((wage_max.isnumeric() and wage_min.isnumeric())
                or (wage_min == 'dohodou' and wage_max == 'dohodou')):
            sg.popup("Pay entered in a wrong format: enter either 'dohodou' in both fields, or a number in both fields")
            continue
        if wage_min != 'dohodou' and wage_max != 'dohodou':
            wage_min = float(wage_min)
            wage_max = float(wage_max)
        else:
            wage_max = -1
            wage_min = -1
        if wage_min > wage_max:
            sg.popup('Pay entered in a wrong format: the minimum value must not exceed the maximum value')
            continue
        user_input_df.loc[0, location] = 1
        user_input_df.loc[0, work_hrs] = 1
        user_input_df.loc[0, experience] = 1
        user_input_df.loc[0, skill1] = 1
        user_input_df.loc[0, skill2] = 1
        user_input_df.loc[0, skill3] = 1
        user_input_df.loc[0, 'min_pay'] = wage_min
        user_input_df.loc[0, 'max_pay'] = wage_max
        user_input_df.to_csv("user_input_df_encoded.csv", index=False)
        break

    # end the program if the user closes the window
    if event == sg.WIN_CLOSED:
        break

window.close()
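
# The three generators below build artificial user vectors for a given offer
# (reference) vector: generate_exact_match copies the offer's attributes
# (label 1), generate_approx_match perturbs the experience, one skill and the
# pay interval (label 1), and generate_no_match randomises every category
# (label 0). The pay fields are randomised in all three cases.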

def generate_exact_match(reference_vector):
    # prepare an empty user vector
    artif_user_vector = pd.DataFrame(0, index=np.arange(1), columns=reference_vector.columns)

    # flags that limit how many of the 1s actually get transferred to the user vector
    location_chosen = False
    uvazek_chosen = False
    zkusenost_chosen = False
    skill_req_counter = 0

    # assemble the user vector
    for column in reference_vector.columns:
        if column in dictionary['location'] and not location_chosen and reference_vector[column][0] == 1:
            location_chosen = True
            artif_user_vector.loc[0, column] = 1
        if column in dictionary['Úvazek'] and not uvazek_chosen and reference_vector[column][0] == 1:
            uvazek_chosen = True
            artif_user_vector.loc[0, column] = 1
        if column in dictionary['Požadovaná zkušenost'] and not zkusenost_chosen and reference_vector[column][0] == 1:
            zkusenost_chosen = True
            artif_user_vector.loc[0, column] = 1
        if column in dictionary['skill_req'] and skill_req_counter < 3 and reference_vector[column][0] == 1:
            skill_req_counter += 1
            artif_user_vector.loc[0, column] = 1

    # add a bit of randomness to the pay values
    # get the original values
    minimum_pay = reference_vector['min_pay'][0]
    maximum_pay = reference_vector['max_pay'][0]
    # if the pay is 'dohodou':
    if minimum_pay == -1 and maximum_pay == -1:
        coin_toss = rng.randint(0, 2)
        # keep 'dohodou'
        if coin_toss == 0:
            artif_user_vector.loc[0, 'min_pay'] = -1
            artif_user_vector.loc[0, 'max_pay'] = -1
        # or generate a random interval
        else:
            tmp_min = rng.randint(0, 200000)
            tmp_max = rng.randint(0, 200000)
            if tmp_max < tmp_min:
                tmp_min, tmp_max = tmp_max, tmp_min
            artif_user_vector.loc[0, 'min_pay'] = tmp_min
            artif_user_vector.loc[0, 'max_pay'] = tmp_max
    # otherwise generate values within the given interval
    else:
        interval_midpoint = (maximum_pay + minimum_pay) // 2
        artif_user_vector.loc[0, 'min_pay'] = rng.randint(minimum_pay, interval_midpoint)
        artif_user_vector.loc[0, 'max_pay'] = rng.randint(interval_midpoint, maximum_pay)
    return artif_user_vector.iloc[0].values


def generate_approx_match(reference_vector):
    # prepare an empty user vector
    artif_user_vector = pd.DataFrame(0, index=np.arange(1), columns=reference_vector.columns)

    # flags that limit how many of the 1s actually get transferred to the user vector
    location_chosen = False
    uvazek_chosen = False
    zkusenost_chosen = False
    skill_req_counter = 0

    # assemble the user vector
    for column in reference_vector.columns:
        if column in dictionary['location'] and not location_chosen and reference_vector[column][0] == 1:
            location_chosen = True
            artif_user_vector.loc[0, column] = 1
        if column in dictionary['Úvazek'] and not uvazek_chosen and reference_vector[column][0] == 1:
            uvazek_chosen = True
            artif_user_vector.loc[0, column] = 1
        # randomise 'Požadovaná zkušenost' (required experience): it does not
        # have to match, but only when the offer says 'dohodou'
        if column in dictionary['Požadovaná zkušenost'] and not zkusenost_chosen and reference_vector[column][0] == 1:
            zkusenost_chosen = True
            if column == 'dohodou':
                while True:
                    coin_toss = rng.randint(0, len(dictionary['Požadovaná zkušenost']))
                    choice = dictionary['Požadovaná zkušenost'][coin_toss]
                    if choice != 'dohodou':
                        artif_user_vector.loc[0, choice] = 1
                        break
            else:
                artif_user_vector.loc[0, column] = 1
        # one of the three skill requirements does not have to match exactly
        if column in dictionary['skill_req'] and skill_req_counter < 3 and reference_vector[column][0] == 1:
            skill_req_counter += 1
            # randomise the second skill requirement
            if skill_req_counter == 2:
                coin_toss = rng.randint(0, len(dictionary['skill_req']))
                random_skill = dictionary['skill_req'][coin_toss]
                artif_user_vector.loc[0, random_skill] = 1
            # the others should match
            else:
                artif_user_vector.loc[0, column] = 1

    # add a bit of randomness to the pay values
    # get the original values
    minimum_pay = reference_vector['min_pay'][0]
    maximum_pay = reference_vector['max_pay'][0]
    # if the pay is 'dohodou':
    if minimum_pay == -1 and maximum_pay == -1:
        coin_toss = rng.randint(0, 2)
        # keep 'dohodou'
        if coin_toss == 0:
            artif_user_vector.loc[0, 'min_pay'] = -1
            artif_user_vector.loc[0, 'max_pay'] = -1
        # or generate a random interval
        else:
            tmp_min = rng.randint(0, 200000)
            tmp_max = rng.randint(0, 200000)
            if tmp_max < tmp_min:
                tmp_min, tmp_max = tmp_max, tmp_min
            artif_user_vector.loc[0, 'min_pay'] = tmp_min
            artif_user_vector.loc[0, 'max_pay'] = tmp_max
    # otherwise generate values within the given interval
    # (with some overlap allowed to simulate an approximate match)
    else:
        interval_midpoint = (maximum_pay + minimum_pay) // 2
        artif_user_vector.loc[0, 'min_pay'] = rng.randint(0, interval_midpoint)
        artif_user_vector.loc[0, 'max_pay'] = rng.randint(interval_midpoint, maximum_pay + 100000)
    return artif_user_vector.iloc[0].values
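
# Pay sampling sketch (hypothetical offer with min_pay=40000, max_pay=60000,
# so midpoint 50000): generate_exact_match draws min from [40000, 50000) and
# max from [50000, 60000), staying inside the offer's interval, while
# generate_approx_match draws min from [0, 50000) and max from
# [50000, 160000), allowing only partial overlap with the offer.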

def generate_no_match(reference_vector):
    # prepare an empty user vector
    artif_user_vector = pd.DataFrame(0, index=np.arange(1), columns=reference_vector.columns)

    # flags that limit how many of the 1s actually get transferred to the user vector
    location_chosen = False
    uvazek_chosen = False
    zkusenost_chosen = False
    skill_req_counter = 0

    # assemble the user vector: randomise everything
    for column in reference_vector.columns:
        if column in dictionary['location'] and not location_chosen and reference_vector[column][0] == 1:
            location_chosen = True
            coin_toss = rng.randint(0, len(dictionary['location']))
            random_location = dictionary['location'][coin_toss]
            artif_user_vector.loc[0, random_location] = 1
        if column in dictionary['Úvazek'] and not uvazek_chosen and reference_vector[column][0] == 1:
            uvazek_chosen = True
            coin_toss = rng.randint(0, len(dictionary['Úvazek']))
            random_uvazek = dictionary['Úvazek'][coin_toss]
            artif_user_vector.loc[0, random_uvazek] = 1
        if column in dictionary['Požadovaná zkušenost'] and not zkusenost_chosen and reference_vector[column][0] == 1:
            zkusenost_chosen = True
            coin_toss = rng.randint(0, len(dictionary['Požadovaná zkušenost']))
            random_experience = dictionary['Požadovaná zkušenost'][coin_toss]
            artif_user_vector.loc[0, random_experience] = 1
        if column in dictionary['skill_req'] and skill_req_counter < 3 and reference_vector[column][0] == 1:
            skill_req_counter += 1
            coin_toss = rng.randint(0, len(dictionary['skill_req']))
            random_skill = dictionary['skill_req'][coin_toss]
            artif_user_vector.loc[0, random_skill] = 1

    # add a bit of randomness to the pay values
    # get the original values
    minimum_pay = reference_vector['min_pay'][0]
    maximum_pay = reference_vector['max_pay'][0]
    # if the pay is 'dohodou':
    if minimum_pay == -1 and maximum_pay == -1:
        coin_toss = rng.randint(0, 2)
        # keep 'dohodou'
        if coin_toss == 0:
            artif_user_vector.loc[0, 'min_pay'] = -1
            artif_user_vector.loc[0, 'max_pay'] = -1
        # or generate a random interval
        else:
            tmp_min = rng.randint(0, 200000)
            tmp_max = rng.randint(0, 200000)
            if tmp_max < tmp_min:
                tmp_min, tmp_max = tmp_max, tmp_min
            artif_user_vector.loc[0, 'min_pay'] = tmp_min
            artif_user_vector.loc[0, 'max_pay'] = tmp_max
    # otherwise generate values within the given interval
    # (with some overlap allowed, as in the approximate match)
    else:
        interval_midpoint = (maximum_pay + minimum_pay) // 2
        artif_user_vector.loc[0, 'min_pay'] = rng.randint(0, interval_midpoint)
        artif_user_vector.loc[0, 'max_pay'] = rng.randint(interval_midpoint, maximum_pay + 100000)
    return artif_user_vector.iloc[0].values
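
# get_batch assembles a training batch of 4 * sub_batch_size pairs: the first
# half are exact matches labelled 1, the second half no-matches labelled 0;
# the approximate-match block is currently commented out.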

def get_batch(sub_batch_size):
    # get the encoded data
    encoded_data = pd.read_csv('output_df_encoded.csv')
    # get the batch size
    batch_size = 4 * sub_batch_size

    # prepare an empty matrix for the reference (original) and user (artificial) vector pairs
    vector_size = encoded_data.shape[1]
    pair_matrix = [np.zeros((batch_size, vector_size), dtype=int) for i in range(2)]
    # prepare an empty matrix for the expected results
    result_matrix = np.zeros((batch_size,), dtype=int)

    # add exact matches
    for i in range(2 * sub_batch_size):
        # populate the pair matrix with a reference vector
        rng_idx = rng.randint(0, encoded_data.shape[0])
        pair_matrix[0][i] = encoded_data.iloc[rng_idx]  # original vector
        # separate the reference vector to be passed to the generator
        reference_vector = pd.DataFrame(0, index=np.arange(1), columns=encoded_data.columns)
        reference_vector.loc[encoded_data.index[0]] = encoded_data.iloc[rng_idx]
        # create an artificial user vector
        artif_user_vector = generate_exact_match(reference_vector)
        # populate the matrix with the artificial user vector
        pair_matrix[1][i] = artif_user_vector
        result_matrix[i] = 1  # populate the result

    # add approximate matches (currently disabled)
    # for i in range(sub_batch_size, 2 * sub_batch_size):
    #     # populate the pair matrix with a reference vector
    #     rng_idx = rng.randint(0, encoded_data.shape[0])
    #     pair_matrix[0][i] = encoded_data.iloc[rng_idx]  # original vector
    #     # separate the reference vector to be passed to the generator
    #     reference_vector = pd.DataFrame(0, index=np.arange(1), columns=encoded_data.columns)
    #     reference_vector.loc[encoded_data.index[0]] = encoded_data.iloc[rng_idx]
    #     # create an artificial user vector
    #     artif_user_vector = generate_approx_match(reference_vector)
    #     # populate the matrix with the artificial user vector
    #     pair_matrix[1][i] = artif_user_vector
    #     result_matrix[i] = 1

    # add no-matches
    for i in range(2 * sub_batch_size, 4 * sub_batch_size):
        # populate the pair matrix with a reference vector
        rng_idx = rng.randint(0, encoded_data.shape[0])
        pair_matrix[0][i] = encoded_data.iloc[rng_idx]  # original vector
        # separate the reference vector to be passed to the generator
        reference_vector = pd.DataFrame(0, index=np.arange(1), columns=encoded_data.columns)
        reference_vector.loc[encoded_data.index[0]] = encoded_data.iloc[rng_idx]
        # create an artificial user vector
        artif_user_vector = generate_no_match(reference_vector)
        # populate the matrix with the artificial user vector
        pair_matrix[1][i] = artif_user_vector
        result_matrix[i] = 0  # populate the result

    return pair_matrix, result_matrix


def initialize_weights(shape, dtype=None):
    return np.random.normal(loc=0.0, scale=1e-2, size=shape)


def initialize_bias(shape, dtype=None):
    return np.random.normal(loc=0.5, scale=1e-2, size=shape)


def get_siamese_model(input_shape):
    # define the tensors for the two input vectors
    left_input = Input(input_shape)
    right_input = Input(input_shape)

    # initialise the shared network
    model = Sequential()
    # dense hidden layer
    hidden_l_perceptron_no = round(input_shape[0] * (2 / 3))
    model.add(Dense(hidden_l_perceptron_no, activation='relu',
                    kernel_initializer=initialize_weights, bias_initializer=initialize_bias,
                    kernel_regularizer=l2(2e-4)))

    # generate the encodings for the two vectors
    encoded_l = model(left_input)
    encoded_r = model(right_input)

    # add a customised layer to compute the absolute difference between the encodings
    L1_layer = Lambda(lambda tensors: K.abs(tensors[0] - tensors[1]))
    L1_distance = L1_layer([encoded_l, encoded_r])

    # add a dense layer with a sigmoid unit to generate the similarity score
    prediction = Dense(1, activation='sigmoid', bias_initializer=initialize_bias)(L1_distance)

    # connect the inputs with the outputs
    siamese_net = Model(inputs=[left_input, right_input], outputs=prediction)

    return siamese_net
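
# Usage sketch for the siamese net (illustrative; assumes 300-dimensional
# encoded vectors, the real dimensionality comes from output_df.shape[1]):
#   net = get_siamese_model((300,))
#   score = net.predict([np.zeros((1, 300)), np.zeros((1, 300))])
# Identical inputs produce identical encodings, so the L1 distance is a zero
# vector and the untrained output is sigmoid(bias) ~ 0.62, since the output
# bias is initialised around 0.5.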

def make_oneshot_task(sub_n_way):
    """Create pairs of an offer vector and a support set of dummy user vectors
    for testing N-way one-shot learning."""
    encoded_data = pd.read_csv('output_df_encoded.csv')

    # obtain a random original offer vector
    rng_idx = rng.randint(0, encoded_data.shape[0])
    offer_vector = encoded_data.iloc[rng_idx].values  # original offer vector
    # separate the reference vector to be passed to the generators
    reference_vector = pd.DataFrame(0, index=np.arange(1), columns=encoded_data.columns)
    reference_vector.loc[encoded_data.index[0]] = offer_vector

    # generate artificial user vectors
    support_set = []
    targets = []
    multiplied_offer_vector = []
    # exact match
    artif_user_vector = generate_exact_match(reference_vector)
    support_set.append(artif_user_vector)
    targets.append(1)
    multiplied_offer_vector.append(offer_vector)
    # approximate matches
    for i in range(2 * sub_n_way - 1):
        artif_user_vector = generate_approx_match(reference_vector)
        support_set.append(artif_user_vector)
        targets.append(1)
        multiplied_offer_vector.append(offer_vector)
    # no-matches
    for i in range(2 * sub_n_way):
        artif_user_vector = generate_no_match(reference_vector)
        support_set.append(artif_user_vector)
        targets.append(0)
        multiplied_offer_vector.append(offer_vector)

    pairs = [np.asarray(multiplied_offer_vector), np.asarray(support_set)]
    return pairs, np.asarray(targets)


def test_oneshot(model, sub_n_way, k_iter):
    """Test the average N-way one-shot learning accuracy of a siamese neural net over k one-shot tasks."""
    # each task contains 1 exact + (2N - 1) approximate + 2N no-match pairs = 4N pairs
    print("Evaluating model on {} random {} way one-shot learning tasks\n".format(k_iter, 4 * sub_n_way))
    true_positive = 0
    false_positive = 0
    true_negative = 0
    false_negative = 0
    sum_outputs = 0
    for i in range(k_iter):
        inputs, targets = make_oneshot_task(sub_n_way)
        sum_outputs = len(inputs[0])
        probs = model.predict(inputs, verbose=0)
        for pos in range(len(probs)):
            # binarise the similarity score with a 0.65 threshold
            if probs[pos][0] >= 0.65:
                probs[pos][0] = 1
            else:
                probs[pos][0] = 0
            if targets[pos] == 1 and probs[pos][0] == 1:
                true_positive += 1
            if targets[pos] == 1 and probs[pos][0] == 0:
                false_negative += 1
            if targets[pos] == 0 and probs[pos][0] == 1:
                false_positive += 1
            if targets[pos] == 0 and probs[pos][0] == 0:
                true_negative += 1

    total_predictions = k_iter * sum_outputs
    accuracy = (true_positive + true_negative) / total_predictions
    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)
    f_measure = 2 * (precision * recall) / (precision + recall)
    accuracy = round(accuracy * 100, 0)
    precision = round(precision * 100, 0)
    recall = round(recall * 100, 0)
    f_measure = round(f_measure * 100, 0)
    print(f"For {k_iter} iterations of {4 * sub_n_way} way one-shot tasks you got:\n"
          f"accuracy: {accuracy}% (correct predictions / total number of predictions)\n"
          f"precision: {precision}% (what proportion of positive identifications was actually correct?)\n"
          f"recall: {recall}% (what proportion of actual positives was identified correctly?)\n"
          f"f-measure: {f_measure}% (combined metric, the higher the better)\n")
    return accuracy


work_model = get_siamese_model((output_df.shape[1],))
work_model.summary()
work_model.load_weights("./weights/weights.400.h5")
optimizer = Adam(learning_rate=0.001)
work_model.compile(loss="binary_crossentropy", optimizer=optimizer)

# hyperparameters
evaluate_every = 50  # interval for evaluating on one-shot tasks
sub_batch_size = 10  # training batch size, will be * 4 (half exact matches, half no-matches)
n_iter = 500  # total number of training iterations
sub_N_way = 10  # testing batch size, will be 1 exact, (2*N) - 1 approx, 2*N no match
k_iter = 10  # how many one-shot tasks to validate on per testing phase
best = -1

# run the training cycle (requires `import time` if re-enabled)
# model_path = './weights/'
# print("Starting training process!")
# print("-------------------------------------")
# t_start = time.time()
# for i in range(1, n_iter + 1):
#     (inputs, targets) = get_batch(sub_batch_size)
#     loss = work_model.train_on_batch(inputs, targets)
#     if i % evaluate_every == 0:
#         print("\n ------------- \n")
#         print("Time for {0} iterations: {1} mins".format(i, (time.time() - t_start) / 60.0))
#         print("Train Loss: {0}".format(loss))
#         val_acc = test_oneshot(work_model, sub_N_way, k_iter)
#         work_model.save_weights(os.path.join(model_path, 'weights.{}.h5'.format(i)))
#         if val_acc >= best:
#             print("Current best: {0}, previous best: {1}".format(val_acc, best))
#             best = val_acc
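
# Note: the loop below scores each offer against the user vector one pair at a
# time; with a larger dataset a single batched predict call over all offers
# (e.g. tiling the user vector with np.tile) would likely be faster, but the
# one-at-a-time version keeps the printout logic simple.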

# run the user testing cycle
print("starting to predict:")
encoded_data = pd.read_csv('output_df_encoded.csv')
for idx in range(encoded_data.shape[0]):
    offer = encoded_data.iloc[idx].values
    user = user_input_df.iloc[0].values
    pair_to_predict = [np.asarray([offer]), np.asarray([user])]
    probs = work_model.predict(pair_to_predict, verbose=0)
    if probs[0][0] >= 0.9:
        print("-----------------------------------")
        print(probs)
        print(url_df.iloc[idx])
        # print the active attributes of the matched offer...
        offer_vector = pd.DataFrame(0, index=np.arange(1), columns=encoded_data.columns)
        offer_vector.loc[encoded_data.index[0]] = offer
        for column in offer_vector.columns:
            if offer_vector[column].iloc[0] == 1:
                print(column, end=', ')
        print()
        # ...and the active attributes of the user's preferences
        user_vector = pd.DataFrame(0, index=np.arange(1), columns=encoded_data.columns)
        user_vector.loc[encoded_data.index[0]] = user
        for column in user_vector.columns:
            if user_vector[column].iloc[0] == 1:
                print(column, end=', ')
        print()
        print("-----------------------------------")