import os
import pandas as pd
import pickle
import torch
import zipfile
from typing import List, Union, Type, Dict
from pydantic import BaseModel

from .pytorch_models import *

PandasDataFrame = Type[pd.DataFrame]
PandasSeries = Type[pd.Series]
def get_or_create_env_var(var_name, default_value):
    # Get the environment variable if it exists
    value = os.environ.get(var_name)

    # If it doesn't exist, set it to the default value
    if value is None:
        os.environ[var_name] = default_value
        value = default_value

    return value
# Retrieving or setting output folder
env_var_name = 'GRADIO_OUTPUT_FOLDER'
default_value = 'output/'

output_folder = get_or_create_env_var(env_var_name, default_value)
print(f'The value of {env_var_name} is {output_folder}')
# +
''' Fuzzywuzzy/Rapidfuzz scorer to use. Options are: ratio, partial_ratio, token_sort_ratio, partial_token_sort_ratio,
token_set_ratio, partial_token_set_ratio, QRatio, UQRatio, WRatio (default), UWRatio.
Details here: https://stackoverflow.com/questions/31806695/when-to-use-which-fuzz-function-to-compare-2-strings '''

fuzzy_scorer_used = "token_set_ratio"
fuzzy_match_limit = 85
fuzzy_search_addr_limit = 20
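
# Illustrative only (assumes the rapidfuzz package, which is not imported in this module):
# a minimal sketch of how the scorer and limits above could be applied to a shortlist of
# candidate addresses. The project's own fuzzy-matching code lives elsewhere in the repo.
# from rapidfuzz import fuzz, process
#
# candidates = ["10 Downing Street, London", "10 Downing St London SW1A"]
# matches = process.extract(
#     "10 downing street london",
#     candidates,
#     scorer=fuzz.token_set_ratio,        # same scorer as fuzzy_scorer_used
#     limit=fuzzy_search_addr_limit,      # keep at most this many candidates
#     score_cutoff=fuzzy_match_limit,     # drop anything scoring below the match limit
# )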
filter_to_lambeth_pcodes = True

standardise = False

if standardise:
    std = "_std"
else:
    std = "_not_std"

dataset_name = "data" + std
suffix_used = dataset_name + "_" + fuzzy_scorer_used
# https://stackoverflow.com/questions/59221557/tensorflow-v2-replacement-for-tf-contrib-predictor-from-saved-model

# Uncomment these lines for the tensorflow model
#model_type = "tf"
#model_stub = "addr_model_out_lon"
#model_version = "00000001"
#file_step_suffix = "550" # I add a suffix to output files so that comparisons of test data from the same model at different training steps can be kept separate, e.g. '350' indicates a model that has been through 350,000 steps of training

# Uncomment these lines for the pytorch model
model_type = "lstm"
model_stub = "pytorch/lstm"
model_version = ""
file_step_suffix = ""

data_sample_size = 476887
N_EPOCHS = 10
max_predict_len = 12000

word_to_index = {}
cat_to_idx = {}
vocab = []
device = "cpu"

global labels_list
labels_list = []
ROOT_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), '..'))

# If running in a non-standard location (e.g. behind an AWS Lambda Function URL), save the model to the tmp drive instead
if output_folder == "output/":
    out_model_dir = ROOT_DIR
    print(out_model_dir)
else:
    out_model_dir = output_folder[:-1]
    print(out_model_dir)

model_dir_name = os.path.join(ROOT_DIR, "nnet_model", model_stub, model_version)
model_path = os.path.join(model_dir_name, "saved_model.zip")
print("Model zip path: ", model_path)
if os.path.exists(model_path):
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # Better to go without GPU to avoid 'out of memory' issues
    device = "cpu"

    ## The labels_list object defines the structure of the prediction outputs. It must be the same as what the model was originally trained on

    ''' Load pre-trained model '''
    with zipfile.ZipFile(model_path, "r") as zip_ref:
        zip_ref.extractall(out_model_dir)
    # if model_stub == "addr_model_out_lon":
    #     import tensorflow as tf
    #     tf.config.list_physical_devices('GPU')

    #     # Number of labels in total (+1 for the blank category)
    #     n_labels = len(labels_list) + 1

    #     # Allowable characters for the encoded representation
    #     vocab = list(string.digits + string.ascii_lowercase + string.punctuation + string.whitespace)

    #     #print("Loading TF model")
    #     exported_model = tf.saved_model.load(model_dir_name)

    #     labels_list = [
    #         'SaoText',            # 1
    #         'SaoStartNumber',     # 2
    #         'SaoStartSuffix',     # 3
    #         'SaoEndNumber',       # 4
    #         'SaoEndSuffix',       # 5
    #         'PaoText',            # 6
    #         'PaoStartNumber',     # 7
    #         'PaoStartSuffix',     # 8
    #         'PaoEndNumber',       # 9
    #         'PaoEndSuffix',       # 10
    #         'Street',             # 11
    #         'PostTown',           # 12
    #         'AdministrativeArea', # 13
    #         'Postcode'            # 14
    #     ]
| if "pytorch" in model_stub: | |
| labels_list = [ | |
| 'SaoText', # 1 | |
| 'SaoStartNumber', # 2 | |
| 'SaoStartSuffix', # 3 | |
| 'SaoEndNumber', # 4 | |
| 'SaoEndSuffix', # 5 | |
| 'PaoText', # 6 | |
| 'PaoStartNumber', # 7 | |
| 'PaoStartSuffix', # 8 | |
| 'PaoEndNumber', # 9 | |
| 'PaoEndSuffix', # 10 | |
| 'Street', # 11 | |
| 'PostTown', # 12 | |
| 'AdministrativeArea', #13 | |
| 'Postcode', # 14 | |
| 'IGNORE' | |
| ] | |
    if model_type in ("transformer", "gru", "lstm"):
        # Load vocab and word_to_index
        with open(os.path.join(out_model_dir, "vocab.txt"), "r") as f:
            vocab = eval(f.read())
        with open(os.path.join(out_model_dir, "word_to_index.txt"), "r") as f:
            word_to_index = eval(f.read())
        with open(os.path.join(out_model_dir, "cat_to_idx.txt"), "r") as f:
            cat_to_idx = eval(f.read())
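
        # Note: eval() will execute whatever these text files contain. If the files hold plain
        # Python literals (a list and two dicts), ast.literal_eval is a safer drop-in
        # alternative — a sketch, assuming the files really are literals only:
        # import ast
        # with open(os.path.join(out_model_dir, "vocab.txt"), "r") as f:
        #     vocab = ast.literal_eval(f.read())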
        VOCAB_SIZE = len(word_to_index)
        OUTPUT_DIM = len(cat_to_idx) + 1  # Number of classes/categories
        EMBEDDING_DIM = 48
        DROPOUT = 0.1
        PAD_TOKEN = 0
        if model_type == "transformer":
            NHEAD = 4
            NUM_ENCODER_LAYERS = 1
            exported_model = TransformerClassifier(VOCAB_SIZE, EMBEDDING_DIM, NHEAD, NUM_ENCODER_LAYERS, OUTPUT_DIM, DROPOUT, PAD_TOKEN)

        elif model_type == "gru":
            N_LAYERS = 3
            HIDDEN_DIM = 128
            exported_model = TextClassifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT, PAD_TOKEN)

        elif model_type == "lstm":
            N_LAYERS = 3
            HIDDEN_DIM = 128
            exported_model = LSTMTextClassifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT, PAD_TOKEN)
        out_model_file_name = "output_model_" + str(data_sample_size) +\
            "_" + str(N_EPOCHS) + "_" + model_type + ".pth"
        out_model_path = os.path.join(out_model_dir, out_model_file_name)
        print("Model location: ", out_model_path)

        exported_model.load_state_dict(torch.load(out_model_path, map_location=torch.device('cpu'), weights_only=False))
        exported_model.eval()

        device = 'cpu'
        #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        exported_model.to(device)
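
        # Illustrative only, not the project's inference code: a hedged sketch of running a
        # single address string through the loaded model, assuming it accepts a batch of
        # character-index sequences and returns per-character class scores (the real
        # encoding/prediction helpers live elsewhere in this repo).
        # example_address = "10 downing street london sw1a 2aa"
        # indices = [word_to_index.get(ch, PAD_TOKEN) for ch in example_address]
        # inputs = torch.tensor([indices], dtype=torch.long, device=device)
        # with torch.no_grad():
        #     scores = exported_model(inputs)            # assumed shape: (1, seq_len, OUTPUT_DIM)
        # predicted_label_ids = scores.argmax(dim=-1)    # one label index per character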
    else:
        exported_model = []  #tf.keras.models.load_model(model_dir_name, compile=False)

        # Compile the model with a loss function and an optimizer
        #exported_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['categorical_crossentropy'])

else:
    exported_model = []
### ADDRESS MATCHING FUNCTIONS

# The address matcher will try to match <batch_size> records in one go to avoid exceeding memory limits.
batch_size = 10000
ref_batch_size = 20000
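
# Illustrative only: a minimal sketch of how the batch sizes above could be used to chunk a
# search dataframe before matching. `search_df` is hypothetical here; the real batching logic
# lives in the matching functions, not in this constants module.
# def iter_batches(df: pd.DataFrame, size: int):
#     # Yield consecutive slices of the dataframe, each at most `size` rows long
#     for start in range(0, len(df), size):
#         yield df.iloc[start:start + size]
#
# for search_batch in iter_batches(search_df, batch_size):
#     ...  # match this batch against the reference data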
### Fuzzy match method

''' https://recordlinkage.readthedocs.io/en/latest/ref_df-compare.html#recordlinkage.compare.String
The Python Record Linkage Toolkit uses the jellyfish package for the Jaro, Jaro-Winkler, Levenshtein and Damerau-Levenshtein algorithms.
Options are ['jaro', 'jarowinkler', 'levenshtein', 'damerau_levenshtein', 'qgram', 'cosine', 'smith_waterman', 'lcs'].
Comparison of some of the Jellyfish string comparison methods: https://manpages.debian.org/testing/python-jellyfish-doc/jellyfish.3.en.html '''

fuzzy_method = "jarowinkler"
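
# Illustrative only (assumes the recordlinkage package and two pre-standardised dataframes,
# neither of which is defined in this file): a sketch of how the jarowinkler method above is
# typically wired into a recordlinkage comparison.
# import recordlinkage
#
# indexer = recordlinkage.Index()
# indexer.block('Postcode')                      # only compare rows that share a postcode
# candidate_links = indexer.index(search_df_after_stand, ref_df_after_stand)
#
# comparer = recordlinkage.Compare()
# comparer.string('Street', 'Street', method=fuzzy_method, label='Street')
# comparer.exact('PaoStartNumber', 'PaoStartNumber', label='PaoStartNumber')
# features = comparer.compute(candidate_links, search_df_after_stand, ref_df_after_stand)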
# Required overall match score for all columns to count as a match
score_cut_off = 98.7 # 97.5

# I set a higher score cut off for nnet street blocking based on empirical data: under this match value I was seeing errors.
# The empirical value was 99.238, but it is set here to 99.5 to be maximally stringent. It is applied in the score_based_match function in 'recordlinkage_funcs.py'.
score_cut_off_nnet_street = 99.5 # 99.238

# If there are no numbers in the address, the matcher needs a perfect score to count a match (otherwise there are too many issues).
no_number_fuzzy_match_limit = 100
# Reference data 'official' column names
ref_address_cols = ["Organisation", "SaoStartNumber", "SaoStartSuffix", "SaoEndNumber", "SaoEndSuffix",
                    "SaoText", "PaoStartNumber", "PaoStartSuffix", "PaoEndNumber",
                    "PaoEndSuffix", "PaoText", "Street", "PostTown", "Postcode"]

# Create a list of matching variables. Text columns will be fuzzy matched.
matching_variables = ref_address_cols
text_columns = ["Organisation", "PaoText", "Street", "PostTown", "Postcode"]

# Modify the relative importance of columns (weights) for the recordlinkage part of the match.
# PostTown and AdministrativeArea are not very important as we have the postcode; street number and name are important.
Organisation_weight = 0.1 # Organisation weight is very low, just to resolve tie breakers for very similar addresses
PaoStartNumber_weight = 2
SaoStartNumber_weight = 2
Street_weight = 2
PostTown_weight = 0
Postcode_weight = 0.5
AdministrativeArea_weight = 0
# -

weight_vals = [1] * len(ref_address_cols)
weight_keys = ref_address_cols
weights = {weight_keys[i]: weight_vals[i] for i in range(len(weight_keys))}
# +
# Override the default weight of 1 for the more influential columns
weights["Organisation"] = Organisation_weight
weights["SaoStartNumber"] = SaoStartNumber_weight
weights["PaoStartNumber"] = PaoStartNumber_weight
weights["Street"] = Street_weight
weights["PostTown"] = PostTown_weight
weights["Postcode"] = Postcode_weight
# Creating Pydantic basemodel class
class MatcherClass(BaseModel):
    # Fuzzy/general attributes
    fuzzy_scorer_used: str
    fuzzy_match_limit: int
    fuzzy_search_addr_limit: int
    filter_to_lambeth_pcodes: bool
    standardise: bool
    suffix_used: str

    # Neural net attributes
    matching_variables: List[str]
    model_dir_name: str
    file_step_suffix: str
    exported_model: List
    fuzzy_method: str
    score_cut_off: float
    text_columns: List[str]
    weights: dict
    model_type: str
    labels_list: List[str]

    # These are variables that are added on later

    # Pytorch optional variables
    word_to_index: dict
    cat_to_idx: dict
    device: str
    vocab: List[str]

    # Join data
    file_name: str
    ref_name: str
    search_df: pd.DataFrame
    excluded_df: pd.DataFrame
    pre_filter_search_df: pd.DataFrame
    search_address_cols: List[str]
    search_postcode_col: List[str]
    search_df_key_field: str
    ref_df: pd.DataFrame
    ref_pre_filter: pd.DataFrame
    ref_address_cols: List[str]
    new_join_col: List[str]
    #in_joincol_list: List[str]
    existing_match_cols: List[str]
    standard_llpg_format: List[str]

    # Results attributes
    match_results_output: pd.DataFrame
    predict_df_nnet: pd.DataFrame

    # Other attributes generated during training
    compare_all_candidates: List[str]
    diag_shortlist: List[str]
    diag_best_match: List[str]
    results_on_orig_df: pd.DataFrame
    summary: str
    output_summary: str
    match_outputs_name: str
    results_orig_df_name: str

    search_df_after_stand: pd.DataFrame
    ref_df_after_stand: pd.DataFrame
    search_df_after_full_stand: pd.DataFrame
    ref_df_after_full_stand: pd.DataFrame

    search_df_after_stand_series: pd.Series
    ref_df_after_stand_series: pd.Series
    search_df_after_stand_series_full_stand: pd.Series
    ref_df_after_stand_series_full_stand: pd.Series

    # Abort flag if the matcher couldn't even get the results of the first match
    abort_flag: bool

    # This is to allow for Pandas DataFrame types as an argument
    class Config:
        # Allow for custom types such as Pandas DataFrames in the class
        arbitrary_types_allowed = True
        extra = 'allow'
        # Disable protected namespaces to avoid conflicts
        protected_namespaces = ()
# Creating an instance of MatcherClass
InitMatch = MatcherClass(
    # Fuzzy/general attributes
    fuzzy_scorer_used = fuzzy_scorer_used,
    fuzzy_match_limit = fuzzy_match_limit,
    fuzzy_search_addr_limit = fuzzy_search_addr_limit,
    filter_to_lambeth_pcodes = filter_to_lambeth_pcodes,
    standardise = standardise,
    suffix_used = suffix_used,

    # Neural net attributes
    matching_variables = matching_variables,
    model_dir_name = model_dir_name,
    file_step_suffix = file_step_suffix,
    exported_model = [exported_model],
    fuzzy_method = fuzzy_method,
    score_cut_off = score_cut_off,
    text_columns = text_columns,
    weights = weights,
    model_type = model_type,
    labels_list = labels_list,

    # These are variables that are added on later

    # Pytorch optional variables
    word_to_index = word_to_index,
    cat_to_idx = cat_to_idx,
    device = device,
    vocab = vocab,

    # Join data
    file_name = '',
    ref_name = '',
    df_name = '',
    search_df = pd.DataFrame(),
    excluded_df = pd.DataFrame(),
    pre_filter_search_df = pd.DataFrame(),
    search_df_not_matched = pd.DataFrame(),
    search_df_cleaned = pd.DataFrame(),
    search_address_cols = [],
    search_postcode_col = [],
    search_df_key_field = 'index',

    ref_df = pd.DataFrame(),
    ref_df_cleaned = pd.DataFrame(),
    ref_pre_filter = pd.DataFrame(),
    ref_address_cols = [],
    new_join_col = [],
    #in_joincol_list = [],
    existing_match_cols = [],
    standard_llpg_format = [],

    # Results attributes
    match_results_output = pd.DataFrame(),
    predict_df_nnet = pd.DataFrame(),

    # Other attributes generated during training
    compare_all_candidates = [],
    diag_shortlist = [],
    diag_best_match = [],
    results_on_orig_df = pd.DataFrame(),
    summary = "",
    output_summary = "",
    match_outputs_name = "",
    results_orig_df_name = "",

    # Post dataset preparation variables
    search_df_after_stand = pd.DataFrame(),
    ref_df_after_stand = pd.DataFrame(),
    search_df_after_stand_series = pd.Series(),
    ref_df_after_stand_series = pd.Series(),
    search_df_after_full_stand = pd.DataFrame(),
    ref_df_after_full_stand = pd.DataFrame(),
    search_df_after_stand_series_full_stand = pd.Series(),
    ref_df_after_stand_series_full_stand = pd.Series(),

    # Abort flag if the matcher couldn't even get the results of the first match
    abort_flag = False
)
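
# Illustrative only: InitMatch is intended to be imported by the calling code and populated
# as a run progresses. Because the model allows arbitrary types and extra fields, attributes
# can be read and reassigned directly. The import path and input file below are hypothetical.
# from tools.constants import InitMatch                          # hypothetical package path
# InitMatch.search_df = pd.read_csv("addresses_to_match.csv")    # hypothetical input file
# print(InitMatch.fuzzy_match_limit, InitMatch.model_type)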