go_utils.plot

View Source

  1import logging
  2import math
  3
  4import matplotlib.pyplot as plt
  5import pandas as pd
  6import seaborn as sns
  7
  8
  9def plot_freq_bar(df, protocol, column, title, plot_type="bar", log_scale=False):
 10    """
 11    Plots the frequency of different values of a column across the dataset
 12
 13    Parameters
 14    ----------
 15    df : pd.DataFrame
 16      The DataFrame containing the desired data.
 17    protocol : str
 18      The Protocol of the DataFrame (e.g. Mosquito Habitat Mapper or Land Cover)
 19    column : str
 20      The name of the column that the graph will plot.
 21    title : str
 22      The title of the graph
 23    plot_type : str, default="bar"
 24      The type of graph that will be used to visually compare the data.
 25    log_scale : bool, default=False
 26      If a log scale will be used for the data. If true, all the values will be in Log 10.
 27    """
 28    col_vals = pd.DataFrame()
 29    col_vals["Count"] = df.groupby(column).size()
 30    col_vals.reset_index(inplace=True)
 31    col_vals.columns = ["index", "Count"]
 32    if len(col_vals) <= 1:  # pragma: no cover
 33        logging.warning(
 34            f"There is only one value for this column: {col_vals['index'][0]}: {col_vals['Count'][0]}"
 35        )
 36    plt.figure(figsize=(10, 6))
 37    ylabel = "Frequency"
 38    title = f"{protocol} - {title}"
 39    if log_scale:
 40        col_vals["Count"] = pd.Series([math.log10(val) for val in col_vals["Count"]])
 41        ylabel += " (Log Scale)"
 42        title += " (Log Scale)"
 43    if plot_type == "line":  # pragma: no cover
 44        plt.plot(col_vals["index"], col_vals["Count"], color="lightblue")
 45    else:
 46        plt.bar(col_vals["index"], col_vals["Count"], color="lightblue")
 47
 48    plt.title(title)
 49    plt.xlim(left=-0.5)
 50    plt.xlabel(f"{column} Values")
 51    plt.ylabel(ylabel)
 52
 53
 54def multiple_bar_graph(df, protocol, cols, title, log_scale=False):
 55    """
 56    Plots the frequency of different values of a column across the dataset alongside eachother.
 57
 58    Parameters
 59    ----------
 60    df : pd.DataFrame
 61      The DataFrame containing the desired data.
 62    protocol : str
 63      The Protocol of the DataFrame (e.g. Mosquito Habitat Mapper or Land Cover)
 64    cols : list str
 65      The names of the columns that the graph will plot.
 66    title : str
 67      The title of the graph
 68    log_scale : bool, default=False
 69      If a log scale will be used for the data. If true, all the values will be in Log 10.
 70    """
 71
 72    def create_summary_df(cols):
 73        """
 74        data = df
 75        x = [0,6]
 76        hue = photoType: photocount, rejectedcount, etc
 77        """
 78        photo_summary = pd.DataFrame()
 79        for name in cols:
 80            photo_summary[name] = df[name].value_counts()
 81            # print(type(photo_summary[name]))
 82        photo_summary.sort_index(inplace=True)
 83        photo_summary = photo_summary.reset_index()
 84        # print(photo_summary)
 85
 86        category_counts = []
 87
 88        for name in cols:
 89            for j in range(len(photo_summary)):
 90                count = photo_summary[name][j]
 91                if log_scale:
 92                    count = math.log10(count)
 93                new_row = {
 94                    "index": photo_summary["index"][j],
 95                    "category": name,
 96                    "count": count,
 97                }
 98                category_counts.append(new_row)
 99        category_counts = pd.DataFrame(category_counts)
100        return category_counts
101
102    category_counts = create_summary_df(cols)
103    plt.figure(figsize=(10, 6))
104
105    title = f"{protocol} -- {title}"
106    ylabel = "Frequency"
107    if log_scale:
108        ylabel += " (Log Scale)"
109        title += " (Log Scale)"
110
111    ax = sns.barplot(
112        data=category_counts,
113        x="index",
114        y="count",
115        hue="category",
116        palette=[
117            "#377eb8",
118            "#ff7f00",
119            "#4daf4a",
120            "#f781bf",
121            "#a65628",
122            "#984ea3",
123            "#999999",
124            "#e41a1c",
125            "#dede00",
126        ],
127    )
128    ax.set_xlabel("Photo Count")
129    ax.set_ylabel(ylabel)
130
131    ax.set_title(title)
132    plt.legend(loc="upper right")
133
134
def plot_int_distribution(df, col_name, title_name):
    """
    Draws a bar chart of how often each integer value occurs in a column of a cleaned dataset.

    The y-axis uses a log scale. A new pyplot figure is created and left
    open; nothing is returned and the passed DataFrame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
      The DataFrame holding the data.
    col_name : str
      The column whose values are counted.
    title_name : str
      The display name of the column used in the title and y-axis label
      (e.g. mhm_Genus could be shown as just Genus)
    """
    # -9999 is moved to -5 before counting.
    # NOTE(review): presumably -9999 is the null placeholder (the title says
    # "with Null") and -5 keeps its bar close to the real values - confirm.
    values = df[col_name].replace(-9999, -5)
    counts = values.groupby(values).size()

    plt.figure(figsize=(10, 5))
    plt.title(f"{title_name} Distribution (with Null)")
    plt.ylabel(f"{title_name} Entries  (Log Scale)")
    plt.yscale("log")
    plt.bar(counts.index, counts, color="#b30000")
158
159
def completeness_histogram(df, protocol, completeness_col, completeness_type):
    """
    Draws the distribution of a completeness score as a histogram.

    A new pyplot figure is created and left open; nothing is returned.

    Parameters
    ----------
    df : pd.DataFrame
      The DataFrame holding the completeness scores.
    protocol : str
      The protocol name; used as the first part of the plot title.
    completeness_col : str
      The column of the DataFrame whose values are binned.
    completeness_type : str
      The type of completeness score (Sub or Cumulative); shown in the title
      and used as the label of the histogram.
    """
    plt.figure(figsize=(10, 4))
    plt.title(
        f"{protocol} -- {completeness_type} Completeness Scores Frequency Histogram"
    )
    scores = df[completeness_col]
    plt.hist(scores, color="pink", label=completeness_type)
    plt.xlabel("Scores")
    plt.ylabel("Count")
181
182
def save_stored_plots():
    """
    Saves any generated graphs currently stored by the pyplot object.

    Every open figure is written as "<title>.png", where <title> is the title
    of the figure's current axes. Any "/" in the title is replaced by "_" so
    the title cannot be read as a directory path, and a figure without a
    title is saved as "figure_<num>.png" so untitled figures do not overwrite
    each other.
    """
    for num in plt.get_fignums():
        plt.figure(num)  # make this figure current so gca()/savefig() act on it
        title = plt.gca().get_title()
        filename = title.replace("/", "_") or f"figure_{num}"
        plt.savefig(f"{filename}.png")

def plot_freq_bar(df, protocol, column, title, plot_type='bar', log_scale=False) View Source

def plot_freq_bar(df, protocol, column, title, plot_type="bar", log_scale=False):
    """
    Plots the frequency of different values of a column across the dataset

    A new pyplot figure is created and left open; nothing is returned.

    Parameters
    ----------
    df : pd.DataFrame
      The DataFrame containing the desired data.
    protocol : str
      The Protocol of the DataFrame (e.g. Mosquito Habitat Mapper or Land Cover)
    column : str
      The name of the column that the graph will plot.
    title : str
      The title of the graph
    plot_type : str, default="bar"
      The type of graph that will be used to visually compare the data.
      "line" draws a line plot; any other value draws a bar chart.
    log_scale : bool, default=False
      If True, the counts are converted to log base 10 before plotting and
      " (Log Scale)" is appended to the title and the y-axis label.
    """
    # One row per distinct value of `column`, paired with its number of occurrences.
    col_vals = pd.DataFrame()
    col_vals["Count"] = df.groupby(column).size()
    col_vals.reset_index(inplace=True)
    col_vals.columns = ["index", "Count"]

    # Warn about degenerate plots. The empty case is reported separately
    # because there is no first row to quote in the message.
    if col_vals.empty:  # pragma: no cover
        logging.warning(f"There are no values for this column: {column}")
    elif len(col_vals) == 1:  # pragma: no cover
        logging.warning(
            f"There is only one value for this column: {col_vals['index'][0]}: {col_vals['Count'][0]}"
        )

    plt.figure(figsize=(10, 6))
    ylabel = "Frequency"
    title = f"{protocol} - {title}"
    if log_scale:
        # The counts themselves are transformed; the axis stays linear.
        col_vals["Count"] = col_vals["Count"].map(math.log10)
        ylabel += " (Log Scale)"
        title += " (Log Scale)"

    if plot_type == "line":  # pragma: no cover
        plt.plot(col_vals["index"], col_vals["Count"], color="lightblue")
    else:
        plt.bar(col_vals["index"], col_vals["Count"], color="lightblue")

    plt.title(title)
    plt.xlim(left=-0.5)
    plt.xlabel(f"{column} Values")
    plt.ylabel(ylabel)

Plots the frequency of different values of a column across the dataset

Parameters

df (pd.DataFrame): The DataFrame containing the desired data.
protocol (str): The Protocol of the DataFrame (e.g. Mosquito Habitat Mapper or Land Cover)
column (str): The name of the column that the graph will plot.
title (str): The title of the graph
plot_type (str, default="bar"): The type of graph that will be used to visually compare the data.
log_scale (bool, default=False): Whether to plot the counts on a log scale. If True, every count is converted to its base-10 logarithm.

def multiple_bar_graph(df, protocol, cols, title, log_scale=False) View Source

 55def multiple_bar_graph(df, protocol, cols, title, log_scale=False):
 56    """
 57    Plots the frequency of different values of a column across the dataset alongside eachother.
 58
 59    Parameters
 60    ----------
 61    df : pd.DataFrame
 62      The DataFrame containing the desired data.
 63    protocol : str
 64      The Protocol of the DataFrame (e.g. Mosquito Habitat Mapper or Land Cover)
 65    cols : list str
 66      The names of the columns that the graph will plot.
 67    title : str
 68      The title of the graph
 69    log_scale : bool, default=False
 70      If a log scale will be used for the data. If true, all the values will be in Log 10.
 71    """
 72
 73    def create_summary_df(cols):
 74        """
 75        data = df
 76        x = [0,6]
 77        hue = photoType: photocount, rejectedcount, etc
 78        """
 79        photo_summary = pd.DataFrame()
 80        for name in cols:
 81            photo_summary[name] = df[name].value_counts()
 82            # print(type(photo_summary[name]))
 83        photo_summary.sort_index(inplace=True)
 84        photo_summary = photo_summary.reset_index()
 85        # print(photo_summary)
 86
 87        category_counts = []
 88
 89        for name in cols:
 90            for j in range(len(photo_summary)):
 91                count = photo_summary[name][j]
 92                if log_scale:
 93                    count = math.log10(count)
 94                new_row = {
 95                    "index": photo_summary["index"][j],
 96                    "category": name,
 97                    "count": count,
 98                }
 99                category_counts.append(new_row)
100        category_counts = pd.DataFrame(category_counts)
101        return category_counts
102
103    category_counts = create_summary_df(cols)
104    plt.figure(figsize=(10, 6))
105
106    title = f"{protocol} -- {title}"
107    ylabel = "Frequency"
108    if log_scale:
109        ylabel += " (Log Scale)"
110        title += " (Log Scale)"
111
112    ax = sns.barplot(
113        data=category_counts,
114        x="index",
115        y="count",
116        hue="category",
117        palette=[
118            "#377eb8",
119            "#ff7f00",
120            "#4daf4a",
121            "#f781bf",
122            "#a65628",
123            "#984ea3",
124            "#999999",
125            "#e41a1c",
126            "#dede00",
127        ],
128    )
129    ax.set_xlabel("Photo Count")
130    ax.set_ylabel(ylabel)
131
132    ax.set_title(title)
133    plt.legend(loc="upper right")

Plots the frequency of different values of several columns across the dataset alongside each other.

Parameters

df (pd.DataFrame): The DataFrame containing the desired data.
protocol (str): The Protocol of the DataFrame (e.g. Mosquito Habitat Mapper or Land Cover)
cols (list str): The names of the columns that the graph will plot.
title (str): The title of the graph
log_scale (bool, default=False): Whether to plot the counts on a log scale. If True, every count is converted to its base-10 logarithm.

def plot_int_distribution(df, col_name, title_name) View Source

def plot_int_distribution(df, col_name, title_name):
    """
    Draws a bar chart of how often each integer value occurs in a column of a cleaned dataset.

    The y-axis uses a log scale. A new pyplot figure is created and left
    open; nothing is returned and the passed DataFrame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
      The DataFrame holding the data.
    col_name : str
      The column whose values are counted.
    title_name : str
      The display name of the column used in the title and y-axis label
      (e.g. mhm_Genus could be shown as just Genus)
    """
    # -9999 is moved to -5 before counting.
    # NOTE(review): presumably -9999 is the null placeholder (the title says
    # "with Null") and -5 keeps its bar close to the real values - confirm.
    values = df[col_name].replace(-9999, -5)
    counts = values.groupby(values).size()

    plt.figure(figsize=(10, 5))
    plt.title(f"{title_name} Distribution (with Null)")
    plt.ylabel(f"{title_name} Entries  (Log Scale)")
    plt.yscale("log")
    plt.bar(counts.index, counts, color="#b30000")

Plots the frequency of different integer values of a column across a cleaned dataset

Parameters

df (pd.DataFrame): The DataFrame containing the desired data.
col_name (str): The name of the column that the graph will plot.
title_name (str): The display name to use for the column in the title (e.g. mhm_Genus could be shown as just Genus)

def completeness_histogram(df, protocol, completeness_col, completeness_type) View Source

def completeness_histogram(df, protocol, completeness_col, completeness_type):
    """
    Draws the distribution of a completeness score as a histogram.

    A new pyplot figure is created and left open; nothing is returned.

    Parameters
    ----------
    df : pd.DataFrame
      The DataFrame holding the completeness scores.
    protocol : str
      The protocol name; used as the first part of the plot title.
    completeness_col : str
      The column of the DataFrame whose values are binned.
    completeness_type : str
      The type of completeness score (Sub or Cumulative); shown in the title
      and used as the label of the histogram.
    """
    plt.figure(figsize=(10, 4))
    plt.title(
        f"{protocol} -- {completeness_type} Completeness Scores Frequency Histogram"
    )
    scores = df[completeness_col]
    plt.hist(scores, color="pink", label=completeness_type)
    plt.xlabel("Scores")
    plt.ylabel("Count")

Plots a histogram of the completeness score distribution.

Parameters

df (pd.DataFrame): The DataFrame containing the desired data.
protocol (str): The name of the protocol that the graph will plot.
completeness_col (str): The column containing the desired completeness metric
completeness_type (str): The type of completeness score (Sub or Cumulative)

def save_stored_plots() View Source

def save_stored_plots():
    """
    Saves any generated graphs currently stored by the pyplot object.

    Every open figure is written as "<title>.png", where <title> is the title
    of the figure's current axes. Any "/" in the title is replaced by "_" so
    the title cannot be read as a directory path, and a figure without a
    title is saved as "figure_<num>.png" so untitled figures do not overwrite
    each other.
    """
    for num in plt.get_fignums():
        plt.figure(num)  # make this figure current so gca()/savefig() act on it
        title = plt.gca().get_title()
        filename = title.replace("/", "_") or f"figure_{num}"
        plt.savefig(f"{filename}.png")

Saves any generated graphs currently stored by the pyplot object.