import numpy as np
import pandas as pd
from matplotlib.gridspec import GridSpec
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import matplotlib.colors as mcolors
import seaborn as sns
from pyPMF import readers, plotter, get_sourcesCategories, add_season
class PMF(object):
    """PMF output of the US EPA PMF5.0 software in handy format (pandas DataFrame).

    Several utilities and plots are also available.

    Attribute naming: the ``_b`` suffix refers to the base run and the ``_c``
    suffix to the constrained run.
    """

    # Species that `recompute_new_species` knows how to rebuild.
    _RECOMPUTABLE_SPECIES = ("OC",)

    # Factor applied to each organic species before it is added to "OC*" when
    # recomputing "OC" (presumably the carbon mass fraction of the species --
    # TODO confirm against the reference used by the authors).
    _OC_EQUIVALENT_CARBON = {
        'Oxalate': 0.27,
        'Arabitol': 0.40,
        'Mannitol': 0.40,
        'Sorbitol': 0.40,
        'Polyols': 0.40,
        'Levoglucosan': 0.44,
        'Mannosan': 0.44,
        'Galactosan': 0.44,
        'MSA': 0.12,
        'Glucose': 0.44,
        'Cellulose': 0.44,
        'Maleic': 0.41,
        'Succinic': 0.41,
        'Citraconic': 0.46,
        'Glutaric': 0.45,
        'Oxoheptanedioic': 0.48,
        'MethylSuccinic': 0.53,
        'Adipic': 0.49,
        'Methylglutaric': 0.49,
        '3-MBTCA': 0.47,
        'Phtalic': 0.58,
        'Pinic': 0.58,
        'Suberic': 0.55,
        'Azelaic': 0.57,
        'Sebacic': 0.59,
    }

    def __init__(self, site, reader=None, savedir="./", BDIR=None, SQL_connection=None,
                 SQL_table_names=None, SQL_program=None):
        """PMF object from output of EPAPMF5.

        Parameters
        ----------
        site : str
            Name of the site (prefix of each file if output as xlsx).
        reader : str or None, 'xlsx' or 'sql'
            Format of the saved output of the PMF

            - xlsx : saved as xlsx output. Need to specify also BDIR
            - sql : name of the SQL database. Need to specify also SQL_connection,
              SQL_program and SQL_table_names
            - None : no reader is attached (``self.read`` is None)
        savedir : str, default current path
            Path to directory to save the figures
        BDIR : str
            The directory where the xlsx files live, if output as xlsx
        SQL_connection : SQL connection to an existing database
        SQL_program : str, optional
            If the database contains a "Program" column, specify the program wanted
        SQL_table_names : dict
            Mapping of PMF output and table name in the SQL database

        Raises
        ------
        ValueError
            If `reader` is neither None, 'xlsx' nor 'sql'.
        """
        # Fail early on a typo instead of leaving the object without a reader.
        if reader not in (None, "xlsx", "sql"):
            raise ValueError(
                "Unknown reader {!r}: expected 'xlsx', 'sql' or None".format(reader)
            )

        self._site = site

        self.read = None
        if reader == "xlsx":
            self.read = readers.XlsxReader(BDIR=BDIR, site=site, pmf=self)
        elif reader == "sql":
            self.read = readers.SqlReader(
                site=site,
                pmf=self,
                SQL_program=SQL_program,
                SQL_connection=SQL_connection,
                SQL_table_names=SQL_table_names,
            )

        self.plot = plotter.Plotter(pmf=self, savedir=savedir)

        # Metadata and data frames start empty; several methods of this class
        # call the reader on demand when they find one of them still None.
        self.profiles = None
        self.nprofiles = None
        self.species = None
        self.nspecies = None
        self.totalVar = None
        self.dfprofiles_b = None
        self.dfcontrib_b = None
        self.dfprofiles_c = None
        self.dfcontrib_c = None
        self.dfBS_profile_b = None
        self.dfBS_profile_c = None
        self.dfbootstrap_mapping_b = None
        self.dfbootstrap_mapping_c = None
        self.df_disp_swap_b = None
        self.df_disp_swap_c = None
        self.df_uncertainties_summary_b = None
        self.df_uncertainties_summary_c = None

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _get_profiles(self, constrained):
        """Return the profiles frame of the requested run, reading it if needed."""
        if constrained:
            if self.dfprofiles_c is None:
                self.read.read_constrained_profiles()
            return self.dfprofiles_c
        if self.dfprofiles_b is None:
            self.read.read_base_profiles()
        return self.dfprofiles_b

    def _get_contributions(self, constrained):
        """Return the contributions frame of the requested run, reading it if needed."""
        if constrained:
            if self.dfcontrib_c is None:
                self.read.read_constrained_contributions()
            return self.dfcontrib_c
        if self.dfcontrib_b is None:
            self.read.read_base_contributions()
        return self.dfcontrib_b

    def _get_metadata(self, name):
        """Return metadata attribute `name`, calling ``read_metadata`` if it is None."""
        if getattr(self, name) is None:
            self.read.read_metadata()
        return getattr(self, name)

    @staticmethod
    def _holds_labels(index):
        """Tell whether `index` has an object or string dtype (i.e. may hold names)."""
        return index.dtype == 'O' or pd.api.types.is_string_dtype(index.dtype)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def to_cubic_meter(self, specie=None, constrained=True, profiles=None):
        """Convert the contribution in cubic meter for the given specie.

        Each profile's contribution column is multiplied by the value of
        `specie` in that profile.

        Parameters
        ----------
        constrained : Boolean, default True
            Use the constrained run (False for the base run)
        specie : str, the specie, default totalVar
        profiles : list of profile, default all profiles

        Return
        ------
        df : pd.DataFrame
            Same index as the contribution frame, one column per profile.
        """
        if specie is None:
            specie = self._get_metadata("totalVar")
        if profiles is None:
            profiles = self._get_metadata("profiles")

        dfcontrib = self._get_contributions(constrained)
        dfprofiles = self._get_profiles(constrained)

        # Column-wise product: the Series is aligned on the profile names.
        return dfcontrib[profiles] * dfprofiles.loc[specie, profiles]

    def to_relative_mass(self, constrained=True, species=None, profiles=None):
        """Compute the factor profile relative mass (i.e. each species divided
        by the totalVar mass).

        Parameters
        ----------
        constrained : Boolean, default True
            Use the constrained run (False for the base run)
        species : list of str, default all species
            Rows to keep. When None, every row of the profiles frame is returned.
        profiles : list of str, default all profiles

        Return
        ------
        df : pd.DataFrame
        """
        dfprofiles = self._get_profiles(constrained)
        if profiles is None:
            profiles = self._get_metadata("profiles")
        totalVar = self._get_metadata("totalVar")

        if species is None:
            numerator = dfprofiles[profiles]
        else:
            numerator = dfprofiles.loc[species, profiles]

        return numerator / dfprofiles.loc[totalVar, profiles]

    def get_total_specie_sum(self, constrained=True):
        """Return the total specie sum profiles in %.

        Every row of the profiles frame is divided by its own sum over the
        profiles and multiplied by 100.

        Parameters
        ----------
        constrained : boolean, default True
            use the constrained run or not

        Returns
        -------
        df : pd.DataFrame
            The normalized species sum per profiles
        """
        dfprofiles = self._get_profiles(constrained)
        return dfprofiles.div(dfprofiles.sum(axis=1), axis=0) * 100

    def get_seasonal_contribution(self, specie=None, annual=True,
                                  normalize=True, constrained=True):
        """Get a dataframe of seasonal contribution.

        Parameters
        ----------
        specie : str, default to total variable
        annual : Boolean, default True
            Add an "Annual" row, computed as the mean of the seasonal rows.
        normalize : Boolean, default True
            If True, each season holds the share of every profile in the
            seasonal sum (the shares of one season add up to 1). If False,
            each season holds the mean contribution of every profile.
        constrained : Boolean, default True
            Use the constrained run (False for the base run)

        Return
        ------
        df : pd.DataFrame
            seasonal contribution, rows ordered Winter, Spring, Summer, Fall
            (then Annual when requested).
        """
        dfprofiles = self._get_profiles(constrained)
        dfcontrib = self._get_contributions(constrained)
        if specie is None:
            specie = self._get_metadata("totalVar")

        ordered_season = ["Winter", "Spring", "Summer", "Fall"]
        if annual:
            ordered_season.append("Annual")

        dfcontribSeason = (dfprofiles.loc[specie] * dfcontrib).sort_index(axis=1)
        # add_season is expected to provide the "season" column grouped on below.
        dfcontribSeason = add_season(dfcontribSeason, month=False).infer_objects()
        grouped = dfcontribSeason.groupby("season")

        if normalize:
            seasonal_sum = grouped.sum()
            df = seasonal_sum.div(seasonal_sum.sum(axis=1), axis=0)
        else:
            df = grouped.mean()

        if annual:
            df.loc["Annual", :] = df.mean()

        return df.reindex(ordered_season)

    def replace_totalVar(self, newTotalVar):
        """Replace the total variable name in all dataframes.

        The frames that are loaded are renamed in place; `self.species` and
        `self.totalVar` are updated accordingly.

        Parameters
        ----------
        newTotalVar : str
        """
        DF = [
            self.dfprofiles_b,
            self.dfprofiles_c,
            self.dfBS_profile_b,
            self.dfBS_profile_c,
            self.df_uncertainties_summary_b,
            self.df_uncertainties_summary_c,
        ]
        for df in DF:
            if df is None:
                continue
            df.rename({self.totalVar: newTotalVar}, inplace=True, axis=0)

        if self.species is not None:
            self.species = [
                newTotalVar if x == self.totalVar else x for x in self.species
            ]
        self.totalVar = newTotalVar

    def rename_factors(self, mapper):
        """Rename factors names in all dataframes.

        The frames that are loaded are renamed in place, on both axes, and
        `self.profiles` is updated accordingly.

        Parameters
        ----------
        mapper : dict
            Keys of the dictionary are the old names, and values the desired names
        """
        DF = [
            self.dfprofiles_b,
            self.dfprofiles_c,
            self.dfcontrib_b,
            self.dfcontrib_c,
            self.dfBS_profile_b,
            self.dfBS_profile_c,
            self.df_uncertainties_summary_b,
            self.df_uncertainties_summary_c,
        ]
        for df in DF:
            if df is None:
                continue
            # Only axes that can hold names are renamed (e.g. not a date index).
            if self._holds_labels(df.index):
                df.rename(mapper, inplace=True, axis="index")
            if self._holds_labels(df.columns):
                df.rename(mapper, inplace=True, axis="columns")

        if self.profiles is not None:
            self.profiles = [mapper.get(p, p) for p in self.profiles]

    def rename_factors_to_factors_category(self):
        """Rename the factor profile name to match the category.

        See pyPMF.utils.get_sourcesCategories
        """
        possible_sources = {
            p: get_sourcesCategories([p])[0] for p in self.profiles
        }
        self.rename_factors(possible_sources)

    def recompute_new_species(self, specie):
        """Recompute a specie given the other species. For instance, recompute OC
        from OC* and a list of organic species.

        It modifies in place dfprofiles_b and dfprofiles_c (whichever are
        loaded) and updates self.species. Nothing is modified if the
        computation fails for one of the frames. A `specie` that is not
        supported is silently ignored.

        Parameters
        ----------
        specie : str in ["OC",]
        """
        if specie not in self._RECOMPUTABLE_SPECIES:
            return

        factors = self._OC_EQUIVALENT_CARBON
        organics = [sp for sp in factors if sp in self.species]

        # Compute every new row first so that a missing label in one frame
        # does not leave the object half-updated.
        recomputed = []
        for df in (self.dfprofiles_b, self.dfprofiles_c):
            if df is None:
                continue
            total = df.loc["OC*"].copy()
            for sp in organics:
                total = total + (df.loc[sp] * factors[sp]).infer_objects()
            recomputed.append((df, total.infer_objects()))

        for df, total in recomputed:
            df.loc[specie] = total

        if specie not in self.species:
            self.species.append(specie)

    def print_uncertainties_summary(self, constrained=True, profiles=None,
                                    species=None):
        """Get the uncertainties given by BS, BS-DISP and DISP for the given profiles and
        species.

        Parameters
        ----------
        constrained : boolean, True
            Use the constrained run (False for the base run)
        profiles : list of str
            list of profiles, default all profiles
        species : list of str
            list of species, default all species

        Return
        ------
        df : pd.DataFrame
            BS, DISP and BS-DISP ranges
        """
        if constrained:
            if self.df_uncertainties_summary_c is None:
                self.read.read_constrained_uncertainties_summary()
            df = self.df_uncertainties_summary_c
        else:
            if self.df_uncertainties_summary_b is None:
                self.read.read_base_uncertainties_summary()
            df = self.df_uncertainties_summary_b

        if profiles is None:
            profiles = self._get_metadata("profiles")
        if species is None:
            species = self._get_metadata("species")

        # After transposition the columns are selected by (profile, specie).
        return df.T.loc[:, (profiles, species)]