go_utils.plot
"""Plotting helpers for column frequencies and completeness scores stored in DataFrames."""

import logging
import math
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


def plot_freq_bar(df, protocol, column, title, plot_type="bar", log_scale=False):
    """
    Plots the frequency of different values of a column across the dataset

    The graph is drawn on a new pyplot figure; nothing is returned.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing the desired data.
    protocol : str
        The Protocol of the DataFrame (e.g. Mosquito Habitat Mapper or Land Cover)
    column : str
        The name of the column that the graph will plot.
    title : str
        The title of the graph
    plot_type : str, default="bar"
        The type of graph that will be used to visually compare the data.
        "line" draws a line graph; any other value draws a bar graph.
    log_scale : bool, default=False
        If a log scale will be used for the data. If true, all the values will be in Log 10.
    """
    # Name the axis explicitly so the value column is always called "index",
    # even when `column` itself happens to be named "Count".
    col_vals = df.groupby(column).size().rename_axis("index").reset_index(name="Count")

    if col_vals.empty:  # pragma: no cover
        # Row 0 does not exist here, so it must not be indexed for the message.
        logging.warning(f"There are no values for this column: {column}")
    elif len(col_vals) == 1:  # pragma: no cover
        logging.warning(
            f"There is only one value for this column: {col_vals['index'][0]}: {col_vals['Count'][0]}"
        )

    ylabel = "Frequency"
    title = f"{protocol} - {title}"
    if log_scale:
        # A zero count (e.g. an unobserved categorical level) has no logarithm;
        # plot it as missing instead of raising a math domain error.
        col_vals["Count"] = [
            math.log10(val) if val > 0 else float("nan") for val in col_vals["Count"]
        ]
        ylabel += " (Log Scale)"
        title += " (Log Scale)"

    plt.figure(figsize=(10, 6))
    if plot_type == "line":  # pragma: no cover
        plt.plot(col_vals["index"], col_vals["Count"], color="lightblue")
    else:
        plt.bar(col_vals["index"], col_vals["Count"], color="lightblue")

    plt.title(title)
    plt.xlim(left=-0.5)
    plt.xlabel(f"{column} Values")
    plt.ylabel(ylabel)


def multiple_bar_graph(df, protocol, cols, title, log_scale=False):
    """
    Plots the frequency of different values of multiple columns across the dataset alongside each other.

    The graph is drawn on a new pyplot figure; nothing is returned.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing the desired data.
    protocol : str
        The Protocol of the DataFrame (e.g. Mosquito Habitat Mapper or Land Cover)
    cols : list of str
        The names of the columns that the graph will plot.
    title : str
        The title of the graph
    log_scale : bool, default=False
        If a log scale will be used for the data. If true, all the values will be in Log 10.

    Raises
    ------
    ValueError
        If `cols` is empty.
    """

    def create_summary_df(cols):
        """
        Builds the long-form table handed to seaborn: one row per
        (value, column) pair with the keys "index" (the value), "category"
        (the column name) and "count" (its frequency, or log10 of it).
        """
        if len(cols) == 0:
            raise ValueError("cols must contain at least one column name")

        # Building the frame from a dict of Series takes the union of all the
        # value_counts indexes, so a value that only occurs in a later column
        # is kept instead of being aligned away against the first column.
        photo_summary = pd.DataFrame(
            {name: df[name].value_counts() for name in cols}
        ).sort_index()

        rows = []
        for name in cols:
            # Read the labels straight from the index: the name that
            # reset_index() would give this column is not stable across
            # pandas versions.
            for value, count in photo_summary[name].items():
                if log_scale:
                    # Missing (NaN) and zero counts have no logarithm.
                    count = math.log10(count) if count > 0 else float("nan")
                rows.append({"index": value, "category": name, "count": count})
        return pd.DataFrame(rows, columns=["index", "category", "count"])

    category_counts = create_summary_df(cols)
    plt.figure(figsize=(10, 6))

    title = f"{protocol} -- {title}"
    ylabel = "Frequency"
    if log_scale:
        ylabel += " (Log Scale)"
        title += " (Log Scale)"

    ax = sns.barplot(
        data=category_counts,
        x="index",
        y="count",
        hue="category",
        palette=[
            "#377eb8",
            "#ff7f00",
            "#4daf4a",
            "#f781bf",
            "#a65628",
            "#984ea3",
            "#999999",
            "#e41a1c",
            "#dede00",
        ],
    )
    # NOTE(review): the x-axis label is fixed to "Photo Count" whatever `cols` holds.
    ax.set_xlabel("Photo Count")
    ax.set_ylabel(ylabel)

    ax.set_title(title)
    plt.legend(loc="upper right")


def plot_int_distribution(df, col_name, title_name):
    """
    Plots the frequency of different integer values of a column across a cleaned dataset

    Entries equal to -9999 are plotted at -5. The y axis uses a log scale and
    the caller's DataFrame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing the desired data.
    col_name : str
        The name of the column that the graph will plot.
    title_name : str
        The name of the column as you would like to have as the title (e.g. mhm_Genus could be just Genus in the title)
    """
    # replace() returns a new Series, so the caller's frame stays untouched.
    values = df[col_name].replace(-9999, -5)
    frequency = values.groupby(values).size()

    plt.figure(figsize=(10, 5))
    plt.title(f"{title_name} Distribution (with Null)")
    plt.ylabel(f"{title_name} Entries (Log Scale)")
    plt.yscale("log")
    plt.bar(frequency.index, frequency, color="#b30000")


def completeness_histogram(df, protocol, completeness_col, completeness_type):
    """
    Plots a histogram of the completeness score distribution.

    The histogram is drawn on a new pyplot figure; nothing is returned.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing the desired data.
    protocol : str
        The name of the protocol that the graph will plot.
    completeness_col : str
        The column containing the desired completeness metric
    completeness_type : str
        The type of completeness score (Sub or Cumulative)
    """
    heading = f"{protocol} -- {completeness_type} Completeness Scores Frequency Histogram"

    plt.figure(figsize=(10, 4))
    plt.hist(df[completeness_col], color="pink", label=completeness_type)
    plt.title(heading)
    plt.xlabel("Scores")
    plt.ylabel("Count")


def save_stored_plots():
    """
    Saves any generated graphs currently stored by the pyplot object.

    Every open figure is written as ``<title>.png``, where the title is that
    of the figure's current axes. Path separators in the title are replaced by
    underscores, and a figure without a title is saved as ``figure_<num>.png``.
    """
    separators = [sep for sep in (os.sep, os.altsep) if sep]
    for num in plt.get_fignums():
        fig = plt.figure(num)
        # Only ask for the current axes when one exists: gca() would otherwise
        # add an empty axes to the figure before it is saved.
        title = plt.gca().get_title() if fig.axes else ""

        filename = title
        for sep in separators:
            # A separator would be read as a directory boundary by savefig.
            filename = filename.replace(sep, "_")
        if not filename:
            # Untitled figures would otherwise all be written to ".png".
            filename = f"figure_{num}"
        plt.savefig(f"{filename}.png")
def
plot_freq_bar(df, protocol, column, title, plot_type='bar', log_scale=False)
def plot_freq_bar(df, protocol, column, title, plot_type="bar", log_scale=False):
    """
    Plots the frequency of different values of a column across the dataset

    The graph is drawn on a new pyplot figure; nothing is returned.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing the desired data.
    protocol : str
        The Protocol of the DataFrame (e.g. Mosquito Habitat Mapper or Land Cover)
    column : str
        The name of the column that the graph will plot.
    title : str
        The title of the graph
    plot_type : str, default="bar"
        The type of graph that will be used to visually compare the data.
        "line" draws a line graph; any other value draws a bar graph.
    log_scale : bool, default=False
        If a log scale will be used for the data. If true, all the values will be in Log 10.
    """
    # Name the axis explicitly so the value column is always called "index",
    # even when `column` itself happens to be named "Count".
    col_vals = df.groupby(column).size().rename_axis("index").reset_index(name="Count")

    if col_vals.empty:  # pragma: no cover
        # Row 0 does not exist here, so it must not be indexed for the message.
        logging.warning(f"There are no values for this column: {column}")
    elif len(col_vals) == 1:  # pragma: no cover
        logging.warning(
            f"There is only one value for this column: {col_vals['index'][0]}: {col_vals['Count'][0]}"
        )

    ylabel = "Frequency"
    title = f"{protocol} - {title}"
    if log_scale:
        # A zero count (e.g. an unobserved categorical level) has no logarithm;
        # plot it as missing instead of raising a math domain error.
        col_vals["Count"] = [
            math.log10(val) if val > 0 else float("nan") for val in col_vals["Count"]
        ]
        ylabel += " (Log Scale)"
        title += " (Log Scale)"

    plt.figure(figsize=(10, 6))
    if plot_type == "line":  # pragma: no cover
        plt.plot(col_vals["index"], col_vals["Count"], color="lightblue")
    else:
        plt.bar(col_vals["index"], col_vals["Count"], color="lightblue")

    plt.title(title)
    plt.xlim(left=-0.5)
    plt.xlabel(f"{column} Values")
    plt.ylabel(ylabel)
Plots the frequency of different values of a column across the dataset
Parameters
- df (pd.DataFrame): The DataFrame containing the desired data.
- protocol (str): The Protocol of the DataFrame (e.g. Mosquito Habitat Mapper or Land Cover)
- column (str): The name of the column that the graph will plot.
- title (str): The title of the graph
- plot_type (str, default="bar"): The type of graph that will be used to visually compare the data.
- log_scale (bool, default=False): Whether the data is plotted on a log scale. If True, every count is replaced by its base-10 logarithm.
def
multiple_bar_graph(df, protocol, cols, title, log_scale=False)
def multiple_bar_graph(df, protocol, cols, title, log_scale=False):
    """
    Plots the frequency of different values of multiple columns across the dataset alongside each other.

    The graph is drawn on a new pyplot figure; nothing is returned.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing the desired data.
    protocol : str
        The Protocol of the DataFrame (e.g. Mosquito Habitat Mapper or Land Cover)
    cols : list of str
        The names of the columns that the graph will plot.
    title : str
        The title of the graph
    log_scale : bool, default=False
        If a log scale will be used for the data. If true, all the values will be in Log 10.

    Raises
    ------
    ValueError
        If `cols` is empty.
    """

    def create_summary_df(cols):
        """
        Builds the long-form table handed to seaborn: one row per
        (value, column) pair with the keys "index" (the value), "category"
        (the column name) and "count" (its frequency, or log10 of it).
        """
        if len(cols) == 0:
            raise ValueError("cols must contain at least one column name")

        # Building the frame from a dict of Series takes the union of all the
        # value_counts indexes, so a value that only occurs in a later column
        # is kept instead of being aligned away against the first column.
        photo_summary = pd.DataFrame(
            {name: df[name].value_counts() for name in cols}
        ).sort_index()

        rows = []
        for name in cols:
            # Read the labels straight from the index: the name that
            # reset_index() would give this column is not stable across
            # pandas versions.
            for value, count in photo_summary[name].items():
                if log_scale:
                    # Missing (NaN) and zero counts have no logarithm.
                    count = math.log10(count) if count > 0 else float("nan")
                rows.append({"index": value, "category": name, "count": count})
        return pd.DataFrame(rows, columns=["index", "category", "count"])

    category_counts = create_summary_df(cols)
    plt.figure(figsize=(10, 6))

    title = f"{protocol} -- {title}"
    ylabel = "Frequency"
    if log_scale:
        ylabel += " (Log Scale)"
        title += " (Log Scale)"

    ax = sns.barplot(
        data=category_counts,
        x="index",
        y="count",
        hue="category",
        palette=[
            "#377eb8",
            "#ff7f00",
            "#4daf4a",
            "#f781bf",
            "#a65628",
            "#984ea3",
            "#999999",
            "#e41a1c",
            "#dede00",
        ],
    )
    # NOTE(review): the x-axis label is fixed to "Photo Count" whatever `cols` holds.
    ax.set_xlabel("Photo Count")
    ax.set_ylabel(ylabel)

    ax.set_title(title)
    plt.legend(loc="upper right")
Plots the frequency of different values of multiple columns across the dataset alongside each other.
Parameters
- df (pd.DataFrame): The DataFrame containing the desired data.
- protocol (str): The Protocol of the DataFrame (e.g. Mosquito Habitat Mapper or Land Cover)
- cols (list of str): The names of the columns that the graph will plot.
- title (str): The title of the graph
- log_scale (bool, default=False): Whether the data is plotted on a log scale. If True, every count is replaced by its base-10 logarithm.
def
plot_int_distribution(df, col_name, title_name)
def plot_int_distribution(df, col_name, title_name):
    """
    Plots the frequency of different integer values of a column across a cleaned dataset

    Entries equal to -9999 are plotted at -5. The y axis uses a log scale and
    the caller's DataFrame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing the desired data.
    col_name : str
        The name of the column that the graph will plot.
    title_name : str
        The name of the column as you would like to have as the title (e.g. mhm_Genus could be just Genus in the title)
    """
    # replace() returns a new Series, so the caller's frame stays untouched.
    values = df[col_name].replace(-9999, -5)
    frequency = values.groupby(values).size()

    plt.figure(figsize=(10, 5))
    plt.title(f"{title_name} Distribution (with Null)")
    plt.ylabel(f"{title_name} Entries (Log Scale)")
    plt.yscale("log")
    plt.bar(frequency.index, frequency, color="#b30000")
Plots the frequency of different integer values of a column across a cleaned dataset
Parameters
- df (pd.DataFrame): The DataFrame containing the desired data.
- col_name (str): The name of the column that the graph will plot.
- title_name (str): The name of the column as you would like to have as the title (e.g. mhm_Genus could be just Genus in the title)
def
completeness_histogram(df, protocol, completeness_col, completeness_type)
def completeness_histogram(df, protocol, completeness_col, completeness_type):
    """
    Plots a histogram of the completeness score distribution.

    The histogram is drawn on a new pyplot figure; nothing is returned.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing the desired data.
    protocol : str
        The name of the protocol that the graph will plot.
    completeness_col : str
        The column containing the desired completeness metric
    completeness_type : str
        The type of completeness score (Sub or Cumulative)
    """
    heading = f"{protocol} -- {completeness_type} Completeness Scores Frequency Histogram"

    plt.figure(figsize=(10, 4))
    plt.hist(df[completeness_col], color="pink", label=completeness_type)
    plt.title(heading)
    plt.xlabel("Scores")
    plt.ylabel("Count")
Plots a histogram of the completeness score distribution.
Parameters
- df (pd.DataFrame): The DataFrame containing the desired data.
- protocol (str): The name of the protocol that the graph will plot.
- completeness_col (str): The column containing the desired completeness metric
- completeness_type (str): The type of completeness score (Sub or Cumulative)
def
save_stored_plots()
def save_stored_plots():
    """
    Saves any generated graphs currently stored by the pyplot object.

    Every open figure is written as ``<title>.png``, where the title is that
    of the figure's current axes. Path separators in the title are replaced by
    underscores, and a figure without a title is saved as ``figure_<num>.png``.
    """
    import os  # local import: only needed to look up the path separators

    separators = [sep for sep in (os.sep, os.altsep) if sep]
    for num in plt.get_fignums():
        fig = plt.figure(num)
        # Only ask for the current axes when one exists: gca() would otherwise
        # add an empty axes to the figure before it is saved.
        title = plt.gca().get_title() if fig.axes else ""

        filename = title
        for sep in separators:
            # A separator would be read as a directory boundary by savefig.
            filename = filename.replace(sep, "_")
        if not filename:
            # Untitled figures would otherwise all be written to ".png".
            filename = f"figure_{num}"
        plt.savefig(f"{filename}.png")
Saves any generated graphs currently stored by the pyplot object.