Source code for classifications.merge_bonsai_concordances

import argparse
import os

import pandas as pd

import classifications


def restructure_dataframe(original_df, folder_name, suffix):
    """Reshape one concordance table into a fixed, folder-aware column layout.

    The result always carries ``tree_bonsai_activitytype`` and
    ``tree_bonsai_flowobject`` followed by ``comment``, ``skos_uri`` and
    ``other_classification``. The ``tree_other_activitytype`` and
    ``tree_other_flowobject`` columns are inserted between those two groups
    only when ``folder_name`` is ``"activitytype"`` or ``"flowobject"``.

    Args:
        original_df: Source table. Its ``comment`` and ``skos_uri`` columns
            are always read; ``tree_bonsai`` is read only for the two
            recognised ``folder_name`` values.
        folder_name: ``"activitytype"`` or ``"flowobject"`` selects which
            half of the paired columns is populated; the other half is left
            empty. Any other value leaves both ``tree_bonsai_*`` columns
            empty and omits the ``tree_other_*`` columns.
        suffix: Written to ``other_classification`` on every row after all
            occurrences of ``"tree_"`` are removed from it.

    Returns:
        A new DataFrame in which missing values have been replaced via
        ``where(..., None)``.
    """
    is_activitytype = folder_name == "activitytype"
    is_flowobject = folder_name == "flowobject"

    reshaped = pd.DataFrame()

    # The Bonsai codes go to the column matching folder_name; the sibling
    # column is created empty. "tree_bonsai" is only looked up when needed.
    reshaped["tree_bonsai_activitytype"] = (
        original_df["tree_bonsai"] if is_activitytype else None
    )
    reshaped["tree_bonsai_flowobject"] = (
        original_df["tree_bonsai"] if is_flowobject else None
    )

    if is_activitytype or is_flowobject:
        # Among the first two source columns, the later one supplies the
        # "other" codes.
        # NOTE(review): this presumes the other-classification column is not
        # placed before "tree_bonsai" - confirm against the input tables.
        other_codes = original_df[original_df.columns[:2][-1]]
        reshaped["tree_other_activitytype"] = (
            other_codes if is_activitytype else None
        )
        reshaped["tree_other_flowobject"] = other_codes if is_flowobject else None

    # Columns copied through unchanged.
    for passthrough in ("comment", "skos_uri"):
        reshaped[passthrough] = original_df[passthrough]

    reshaped["other_classification"] = suffix.replace("tree_", "")

    # Swap NaN for None so that missing cells read uniformly.
    return reshaped.where(reshaped.notnull(), None)
def _collect_tables(datapackage, prefix):
    """Return the attributes of *datapackage* whose names start with *prefix*."""
    # dir() returns names alphabetically, so the merge order is deterministic.
    return [
        getattr(datapackage, name)
        for name in dir(datapackage)
        if name.startswith(prefix)
    ]


def main(argv=None):
    """Merge all Bonsai concordance tables into a single CSV file.

    Collects every ``concpair_bonsai_*`` table from
    ``classifications.flow.datapackage`` and every ``conc_bonsai_*`` table
    from the ``activitytype`` and ``flowobject`` datapackages, stacks them
    (flow first, then activitytype, then flowobject), restricts the result to
    a fixed column order, prints it and writes it to
    ``<output_dir>/merged_concordance.csv``.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, which keeps ``main()``
            working as a script entry point.

    Raises:
        ValueError: If none of the three datapackages exposes a concordance
            table, so there is nothing to merge.
        KeyError: If the merged table lacks one of the expected columns.
    """
    parser = argparse.ArgumentParser(
        description="Merge all Bonsai concordance tables into one"
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="output",
        help="Directory to save the updated DataFrames",
    )
    args = parser.parse_args(argv)

    conc_activitytype = _collect_tables(
        classifications.activitytype.datapackage, "conc_bonsai_"
    )
    conc_flowobject = _collect_tables(
        classifications.flowobject.datapackage, "conc_bonsai_"
    )
    conc_flow = _collect_tables(
        classifications.flow.datapackage, "concpair_bonsai_"
    )

    # Pairwise flow concordances come first, then the single-dimension ones.
    tables = [*conc_flow, *conc_activitytype, *conc_flowobject]
    if not tables:
        raise ValueError(
            "No Bonsai concordance tables found in the classifications "
            "datapackages; nothing to merge."
        )

    # Desired column order
    new_order = [
        "activitytype_from",
        "flowobject_from",
        "activitytype_to",
        "flowobject_to",
        "classification_from",
        "classification_to",
        "comment",
        "skos_uri",
    ]

    # One concat over the whole list avoids re-copying the growing frame on
    # every iteration.
    conc_merged = pd.concat(tables, ignore_index=True)[new_order]

    print(conc_merged)

    # Create the output directory if it doesn't exist
    os.makedirs(args.output_dir, exist_ok=True)
    output_path = os.path.join(args.output_dir, "merged_concordance.csv")

    # The merged table is written on every run, overwriting any earlier file.
    conc_merged.to_csv(output_path, index=False)
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()