diff --git a/malariagen_data/__init__.py b/malariagen_data/__init__.py
index dfa555aa7..e54844a25 100644
--- a/malariagen_data/__init__.py
+++ b/malariagen_data/__init__.py
@@ -2,6 +2,7 @@
 from .af1 import Af1
 from .ag3 import Ag3
 from .amin1 import Amin1
 from .anopheles import AnophelesDataResource, Region
+from .as1 import As1
 from .pf7 import Pf7
 from .pf8 import Pf8
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
index 0c1744926..6393a3e12 100644
--- a/malariagen_data/anoph/base.py
+++ b/malariagen_data/anoph/base.py
@@ -437,12 +437,25 @@ def sample_sets(
             # Ensure no duplicates.
             releases = sorted(set(release))
 
-            # Retrieve and concatenate sample sets from multiple releases.
-            df = pd.concat(
-                [self.sample_sets(release=r) for r in releases],
-                axis=0,
-                ignore_index=True,
-            )
+            # Add each release's sample sets DataFrame to a list.
+            sample_set_dfs = []
+            for r in releases:
+                # Get the sample sets for this release.
+                release_sample_sets_df = self.sample_sets(release=r)
+
+                # Only include if there are sample sets for this release.
+                if (
+                    release_sample_sets_df is not None
+                    and not release_sample_sets_df.empty
+                ):
+                    sample_set_dfs.append(release_sample_sets_df)
+
+            if sample_set_dfs:
+                # Concatenate the sample sets DataFrames from multiple releases.
+                df = pd.concat(sample_set_dfs, axis=0, ignore_index=True)
+            else:
+                # No requested release has any sample sets, so return an empty DataFrame.
+                df = pd.DataFrame()
 
         else:
             raise TypeError
diff --git a/malariagen_data/as1.py b/malariagen_data/as1.py
new file mode 100644
index 000000000..7dd259d09
--- /dev/null
+++ b/malariagen_data/as1.py
@@ -0,0 +1,210 @@
+import sys
+
+import plotly.express as px  # type: ignore
+
+import malariagen_data
+from .anopheles import AnophelesDataResource
+
+MAJOR_VERSION_NUMBER = 1
+MAJOR_VERSION_PATH = "v1.0"
+CONFIG_PATH = "v1.0-config.json"
+GCS_DEFAULT_URL = "gs://vo_aste_release_master_us_central1/"
+GCS_DEFAULT_PUBLIC_URL = "gs://vo_aste_temp_us_central1/vo_aste_release/"
+GCS_REGION_URLS = {
+    "us-central1": "gs://vo_aste_release_master_us_central1",
+}
+XPEHH_GWSS_CACHE_NAME = "as1_xpehh_gwss_v1"
+IHS_GWSS_CACHE_NAME = "as1_ihs_gwss_v1"
+
+TAXON_PALETTE = px.colors.qualitative.Plotly
+TAXON_COLORS = {
+    "stephensi": TAXON_PALETTE[0],
+}
+
+
+class As1(AnophelesDataResource):
+    """Provides access to data from As1.x releases.
+
+    Parameters
+    ----------
+    url : str, optional
+        Base path to data. Defaults to use Google Cloud Storage, or can
+        be a local path on your file system if data have been downloaded.
+    site_filters_analysis : str, optional
+        Site filters analysis version.
+    bokeh_output_notebook : bool, optional
+        If True (default), configure bokeh to output plots to the notebook.
+    results_cache : str, optional
+        Path to directory on local file system to save results.
+    log : str or stream, optional
+        File path or stream output for logging messages.
+    debug : bool, optional
+        Set to True to enable debug level logging.
+    show_progress : bool, optional
+        If True, show a progress bar during longer-running computations. The default can be overridden using an environmental variable named MGEN_SHOW_PROGRESS.
+    check_location : bool, optional
+        If True, use ipinfo to check the location of the client system.
+    **storage_options
+        Passed through to fsspec when setting up file system access.
+
+    Examples
+    --------
+    Access data from Google Cloud Storage (default):
+
+        >>> import malariagen_data
+        >>> as1 = malariagen_data.As1()
+
+    Access data downloaded to a local file system:
+
+        >>> as1 = malariagen_data.As1("/local/path/to/vo_aste_release/")
+
+    Access data from Google Cloud Storage, with caching on the local file system
+    in a directory named "gcs_cache":
+
+        >>> as1 = malariagen_data.As1(
+        ...     "simplecache::gs://vo_aste_release_master_us_central1",
+        ...     simplecache=dict(cache_storage="gcs_cache"),
+        ... )
+
+    Set up caching of some longer-running computations on the local file system,
+    in a directory named "results_cache":
+
+        >>> as1 = malariagen_data.As1(results_cache="results_cache")
+
+    """
+
+    _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
+    _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
+
+    def __init__(
+        self,
+        url=None,
+        public_url=GCS_DEFAULT_PUBLIC_URL,
+        bokeh_output_notebook=True,
+        results_cache=None,
+        log=sys.stdout,
+        debug=False,
+        show_progress=None,
+        check_location=True,
+        cohorts_analysis=None,
+        site_filters_analysis=None,
+        discordant_read_calls_analysis=None,
+        pre=False,
+        tqdm_class=None,
+        **storage_options,  # used by fsspec via init_filesystem()
+    ):
+        super().__init__(
+            url=url,
+            public_url=public_url,
+            config_path=CONFIG_PATH,
+            cohorts_analysis=cohorts_analysis,
+            aim_analysis=None,
+            aim_metadata_dtype=None,
+            aim_ids=None,
+            aim_palettes=None,
+            site_filters_analysis=site_filters_analysis,
+            discordant_read_calls_analysis=discordant_read_calls_analysis,
+            default_site_mask="stephensi",
+            default_phasing_analysis="stephensi",
+            default_coverage_calls_analysis="stephensi",
+            bokeh_output_notebook=bokeh_output_notebook,
+            results_cache=results_cache,
+            log=log,
+            debug=debug,
+            show_progress=show_progress,
+            check_location=check_location,
+            pre=pre,
+            gcs_default_url=GCS_DEFAULT_URL,
+            gcs_region_urls=GCS_REGION_URLS,
+            major_version_number=MAJOR_VERSION_NUMBER,
+            major_version_path=MAJOR_VERSION_PATH,
+            gff_gene_type="protein_coding_gene",
+            gff_gene_name_attribute="Note",
+            gff_default_attributes=("ID", "Parent", "Note", "description"),
+            storage_options=storage_options,  # used by fsspec via init_filesystem()
+            tqdm_class=tqdm_class,
+            taxon_colors=TAXON_COLORS,
+            virtual_contigs=None,
+            gene_names=None,
+            inversion_tag_path=None,
+        )
+
+    def __repr__(self):
+        text = (
+            f"<MalariaGEN As1 API client>\n"
+            f"Storage URL             : {self._url}\n"
+            f"Data releases available : {', '.join(self.releases)}\n"
+            f"Results cache           : {self._results_cache}\n"
+            f"Cohorts analysis        : {self._cohorts_analysis}\n"
+            f"Site filters analysis   : {self._site_filters_analysis}\n"
+            f"Software version        : malariagen_data {malariagen_data.__version__}\n"
+            f"Client location         : {self.client_location}\n"
+            f"---\n"
+            f"Please note that data are subject to terms of use,\n"
+            f"for more information see https://www.malariagen.net/data\n"
+            f"or contact support@malariagen.net. For API documentation see \n"
+            f"https://malariagen.github.io/malariagen-data-python/v{malariagen_data.__version__}/As1.html"
+        )
+        return text
+
+    def _repr_html_(self):
+        html = f"""
+            <table class="malariagen-as1">
+                <thead>
+                    <tr>
+                        <th style="text-align: left" colspan="2">MalariaGEN As1 API client</th>
+                    </tr>
+                    <tr><td colspan="2" style="text-align: left">
+                        Please note that data are subject to terms of use,
+                        for more information see <a href="https://www.malariagen.net/data">
+                        the MalariaGEN website</a> or contact support@malariagen.net.
+                        See also the <a href="https://malariagen.github.io/malariagen-data-python/v{malariagen_data.__version__}/As1.html">As1 API docs</a>.
+                    </td></tr>
+                </thead>
+                <tbody>
+                    <tr>
+                        <th style="text-align: left">
+                            Storage URL
+                        </th>
+                        <td>{self._url}</td>
+                    </tr>
+                    <tr>
+                        <th style="text-align: left">
+                            Data releases available
+                        </th>
+                        <td>{', '.join(self.releases)}</td>
+                    </tr>
+                    <tr>
+                        <th style="text-align: left">
+                            Results cache
+                        </th>
+                        <td>{self._results_cache}</td>
+                    </tr>
+                    <tr>
+                        <th style="text-align: left">
+                            Cohorts analysis
+                        </th>
+                        <td>{self._cohorts_analysis}</td>
+                    </tr>
+                    <tr>
+                        <th style="text-align: left">
+                            Site filters analysis
+                        </th>
+                        <td>{self._site_filters_analysis}</td>
+                    </tr>
+                    <tr>
+                        <th style="text-align: left">
+                            Software version
+                        </th>
+                        <td>malariagen_data {malariagen_data.__version__}</td>
+                    </tr>
+                    <tr>
+                        <th style="text-align: left">
+                            Client location
+                        </th>
+                        <td>{self.client_location}</td>
+                    </tr>
+                </tbody>
+            </table>
+        """
+        return html
diff --git a/tests/integration/test_as1.py b/tests/integration/test_as1.py
new file mode 100644
index 000000000..eee14fd84
--- /dev/null
+++ b/tests/integration/test_as1.py
@@ -0,0 +1,22 @@
+from malariagen_data import As1
+
+
+def setup_as1(url="simplecache::gs://vo_aste_release_master_us_central1/", **kwargs):
+    kwargs.setdefault("check_location", False)
+    kwargs.setdefault("show_progress", False)
+    if url is None:
+        # Test the default URL.
+        # Note: This only tests the setup_as1 default URL, not the As1 default.
+        # The test_anopheles setup_subclass tests the true defaults.
+        return As1(**kwargs)
+    if url.startswith("simplecache::"):
+        # Configure the directory on the local file system to cache data.
+        kwargs["simplecache"] = dict(cache_storage="gcs_cache")
+    return As1(url, **kwargs)
+
+
+def test_repr():
+    as1 = setup_as1(check_location=True)
+    assert isinstance(as1, As1)
+    r = repr(as1)
+    assert isinstance(r, str)