From c2a2afabcdeba6462936404932b8249584136b17 Mon Sep 17 00:00:00 2001 From: Tom Kralidis Date: Thu, 9 Nov 2023 07:16:47 -0500 Subject: [PATCH 1/4] implement faceted search (#845) --- default-sample.cfg | 1 + docs/configuration.rst | 1 + pycsw/core/repository.py | 1 + pycsw/core/util.py | 33 ++++++++++++++++++++++----- pycsw/ogc/api/records.py | 49 ++++++++++++++++++++++++++++++++++++++-- 5 files changed, 77 insertions(+), 8 deletions(-) diff --git a/default-sample.cfg b/default-sample.cfg index 2d92c1bfa..ec3113dcd 100644 --- a/default-sample.cfg +++ b/default-sample.cfg @@ -86,6 +86,7 @@ database=sqlite:////var/www/pycsw/tests/functionaltests/suites/cite/data/cite.db table=records #filter=type = 'http://purl.org/dc/dcmitype/Dataset' #max_retries=5 +facets=type,title [metadata:inspire] enabled=true diff --git a/docs/configuration.rst b/docs/configuration.rst index 3120147f1..be4a7e926 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -68,6 +68,7 @@ pycsw's runtime configuration is defined by ``default.cfg``. pycsw ships with a - **source**: the source of this repository only if not local (e.g. :ref:`geonode`, :ref:`odc`). Supported values are ``geonode``, ``odc`` - **filter**: server side database filter to apply as mask to all CSW requests (see :ref:`repofilters`) - **max_retries**: max number of retry attempts when connecting to records-repository database +- **facets**: comma-separated list of facetable properties for search results .. 
note:: diff --git a/pycsw/core/repository.py b/pycsw/core/repository.py index a4e8f09b3..480d75d47 100644 --- a/pycsw/core/repository.py +++ b/pycsw/core/repository.py @@ -142,6 +142,7 @@ def __init__(self, database, context, app_root=None, table='records', repo_filte self.dbtype = self.engine.name self.session = create_session(self.engine) + self.func = func temp_dbtype = None diff --git a/pycsw/core/util.py b/pycsw/core/util.py index 90be66805..6896d2590 100644 --- a/pycsw/core/util.py +++ b/pycsw/core/util.py @@ -192,6 +192,7 @@ def geojson_geometry2bbox(geometry): return bbox + def wkt2geom(ewkt, bounds=True): """Return Shapely geometry object based on WKT/EWKT @@ -207,8 +208,9 @@ def wkt2geom(ewkt, bounds=True): Returns ------- shapely.geometry.base.BaseGeometry or tuple - Depending on the value of the ``bounds`` parameter, returns either - the shapely geometry instance or a tuple with the bounding box. + + Depending on the value of the ``bounds`` parameter, returns either + the shapely geometry instance or a tuple with the bounding box. References ---------- @@ -346,8 +348,8 @@ def ipaddress_in_whitelist(ipaddress, whitelist): if ip_in_network_cidr(ipaddress, white): return True elif white.find('*') != -1: # subnet wildcard - if ipaddress.startswith(white.split('*')[0]): - return True + if ipaddress.startswith(white.split('*')[0]): + return True return False @@ -372,7 +374,7 @@ def get_anytext_from_obj(obj): """ generate bag of text for free text searches accepts dict, list or string - """ + """ if isinstance(obj, dict): for key, value in obj.items(): @@ -432,6 +434,7 @@ def secure_filename(filename): return filename + def jsonify_links(links): """ pycsw:Links column data handler. 
@@ -441,7 +444,7 @@ def jsonify_links(links): LOGGER.debug('JSON link') linkset = json.loads(links) return linkset - except json.decoder.JSONDecodeError as err: # try CSV parsing + except json.decoder.JSONDecodeError: # try CSV parsing LOGGER.debug('old style CSV link') json_links = [] for link in links.split('^'): @@ -525,3 +528,21 @@ def load_custom_repo_mappings(repository_mappings: str) -> typing.Optional[typin if imported_mappings_module is not None: result = getattr(imported_mappings_module, "MD_CORE_MODEL", None) return result + + +def str2bool(value: typing.Union[bool, str]) -> bool: + """ + helper function to return Python boolean + type (source: https://stackoverflow.com/a/715468) + :param value: value to be evaluated + :returns: `bool` of whether the value is boolean-ish + """ + + value2 = False + + if isinstance(value, bool): + value2 = value + else: + value2 = value.lower() in ('yes', 'true', 't', '1', 'on') + + return value2 diff --git a/pycsw/ogc/api/records.py b/pycsw/ogc/api/records.py index 65ecdfccd..d6b61f714 100644 --- a/pycsw/ogc/api/records.py +++ b/pycsw/ogc/api/records.py @@ -43,7 +43,7 @@ from pycsw.core.config import StaticContext from pycsw.core.metadata import parse_record from pycsw.core.pygeofilter_evaluate import to_filter -from pycsw.core.util import bind_url, get_today_and_now, jsonify_links, load_custom_repo_mappings, wkt2geom +from pycsw.core.util import bind_url, get_today_and_now, jsonify_links, load_custom_repo_mappings, str2bool, wkt2geom from pycsw.ogc.api.oapi import gen_oapi from pycsw.ogc.api.util import match_env_var, render_j2_template, to_json @@ -101,6 +101,7 @@ def __init__(self, config: ConfigParser): LOGGER.debug(f'Server URL: {url_}') self.config['server']['url'] = url_.rstrip('/') + self.facets = self.config['repository'].get('facets', 'type').split(',') self.context = StaticContext() @@ -511,6 +512,7 @@ def items(self, headers_, json_post_data, args, collection='metadata:main'): reserved_query_params = [ 'f', + 
'facets', 'filter', 'filter-lang', 'limit', @@ -525,6 +527,7 @@ def items(self, headers_, json_post_data, args, collection='metadata:main'): response = { 'type': 'FeatureCollection', + 'facets': [], 'features': [], 'links': [] } @@ -533,6 +536,7 @@ def items(self, headers_, json_post_data, args, collection='metadata:main'): query_parser = None sortby = None limit = None + facets_requested = False collections = [] if collection not in self.get_all_collections(): @@ -602,6 +606,8 @@ def items(self, headers_, json_post_data, args, collection='metadata:main'): else: query_args.append(f'{k} = "{v}"') + facets_requested = str2bool(args.get('facets', False)) + if collection != 'metadata:main': LOGGER.debug('Adding virtual collection filter') query_args.append(f'parentidentifier = "{collection}"') @@ -661,8 +667,17 @@ def items(self, headers_, json_post_data, args, collection='metadata:main'): return self.get_exception(400, headers_, 'InvalidParameterValue', msg) query = self.repository.session.query(self.repository.dataset).filter(filters) + if facets_requested: + LOGGER.debug('Running facet query') + facets_results = self.get_facets(filters) else: query = self.repository.session.query(self.repository.dataset) + facets_results = self.get_facets() + + if facets_requested: + response['facets'] = facets_results + else: + response.pop('facets') if 'sortby' in args: LOGGER.debug('sortby specified') @@ -971,7 +986,7 @@ def get_collection_info(self, collection_name: str = 'metadata:main', }] } - def get_all_collections(self): + def get_all_collections(self) -> list: """ Get all collections @@ -983,6 +998,36 @@ def get_all_collections(self): return [default_collection] + [vc.identifier for vc in virtual_collections] + def get_facets(self, filters=None) -> dict: + """ + Gets all facets for a given query + + :returns: `dict` of facets + """ + + facets_results = {} + + for facet in self.facets: + if filters is not None: + facetq = 
self.repository.session.query(self.repository.query_mappings[facet], self.repository.func.count(facet)).group_by(facet).filter(filters).all() + else: + LOGGER.debug('Running facet query') + facetq = self.repository.session.query(self.repository.query_mappings[facet], self.repository.func.count(facet)).group_by(facet).all() + + LOGGER.debug('Writing facet query results') + facets_results[facet] = { + 'type': 'terms', + 'property': facet, + 'buckets': [] + } + for fq in facetq: + facets_results[facet]['buckets'].append({ + 'value': fq[0], + 'count': fq[1] + }) + + return facets_results + def record2json(record, url, collection, mode='ogcapi-records'): """ From 4584cf2c160714ebe4674c7aedd0141bb581a392 Mon Sep 17 00:00:00 2001 From: Tom Kralidis Date: Thu, 9 Nov 2023 07:21:51 -0500 Subject: [PATCH 2/4] optimize facets --- pycsw/ogc/api/records.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pycsw/ogc/api/records.py b/pycsw/ogc/api/records.py index d6b61f714..68cb2fd5f 100644 --- a/pycsw/ogc/api/records.py +++ b/pycsw/ogc/api/records.py @@ -1008,11 +1008,11 @@ def get_facets(self, filters=None) -> dict: facets_results = {} for facet in self.facets: + LOGGER.debug(f'Running facet for {facet}') + facetq = self.repository.session.query(self.repository.query_mappings[facet], self.repository.func.count(facet)).group_by(facet) + if filters is not None: - facetq = self.repository.session.query(self.repository.query_mappings[facet], self.repository.func.count(facet)).group_by(facet).filter(filters).all() - else: - LOGGER.debug('Running facet query') - facetq = self.repository.session.query(self.repository.query_mappings[facet], self.repository.func.count(facet)).group_by(facet).all() + facetq = facetq.filter(filters) LOGGER.debug('Writing facet query results') facets_results[facet] = { @@ -1020,7 +1020,8 @@ def get_facets(self, filters=None) -> dict: 'property': facet, 'buckets': [] } - for fq in facetq: + + for fq in facetq.all(): 
facets_results[facet]['buckets'].append({ 'value': fq[0], 'count': fq[1] From 51b35c2e00a18efc5425d877ca6605cf41efa424 Mon Sep 17 00:00:00 2001 From: Paul van Genuchten Date: Thu, 9 Nov 2023 21:32:13 +0100 Subject: [PATCH 3/4] an initial implementation of facets ui needs work on url-completion of filter links --- pycsw/ogc/api/templates/items.html | 33 +++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/pycsw/ogc/api/templates/items.html b/pycsw/ogc/api/templates/items.html index 574cfc7ae..5f4f8d868 100644 --- a/pycsw/ogc/api/templates/items.html +++ b/pycsw/ogc/api/templates/items.html @@ -24,20 +24,39 @@
-{% set nav_links = namespace(prev=None, next=None) %} +{% set nav_links = namespace(prev=None, next=None, self=None) %} {% for link in data['links'] %} - {% if link['rel'] == 'prev' %} - {% set nav_links.prev = link['href'] %} - {% endif %} - {% if link['rel'] == 'next' %} - {% set nav_links.next = link['href'] %} - {% endif %} +{% if link['rel'] == 'prev' %} +{% set nav_links.prev = link['href'] %} +{% endif %} +{% if link['rel'] == 'self' %} +{% set nav_links.self = link['href'] %} +{% endif %} +{% if link['rel'] == 'next' %} +{% set nav_links.next = link['href'] %} +{% endif %} {% endfor %}
+
+ {% if data['facets'] %} + {% for facet in data['facets'].keys() %} +
+
{{ facet }}
+
+ {% for bucket in data['facets'][facet].buckets %} + {{bucket['value']}} + {{bucket['count']}}
+ {% endfor %} +
+
+ {% endfor %} + {% endif %} +
{% if nav_links.prev %} From 308a1f3fe11a29b61d6c7b5b7db3b1c16bdff333 Mon Sep 17 00:00:00 2001 From: Tom Kralidis Date: Sun, 12 Nov 2023 15:53:20 -0500 Subject: [PATCH 4/4] add facet def to OpenAPI gen --- pycsw/ogc/api/oapi.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pycsw/ogc/api/oapi.py b/pycsw/ogc/api/oapi.py index d657d5d92..4cc547492 100644 --- a/pycsw/ogc/api/oapi.py +++ b/pycsw/ogc/api/oapi.py @@ -187,6 +187,17 @@ def gen_oapi(config, oapi_filepath, mode='ogcapi-records'): }, 'style': 'form' } + oapi['components']['parameters']['facets'] = { + 'name': 'facets', + 'in': 'query', + 'description': 'Whether to include facets in results', + 'schema': { + 'type': 'boolean', + 'default': False + }, + 'style': 'form', + 'explode': False + } # TODO: remove local definition of ids once implemented # in OGC API - Records oapi['components']['parameters']['ids'] = { @@ -385,7 +396,8 @@ def gen_oapi(config, oapi_filepath, mode='ogcapi-records'): {'$ref': '#/components/parameters/filter-lang'}, {'$ref': '#/components/parameters/f'}, {'$ref': '#/components/parameters/offset'}, - {'$ref': '#/components/parameters/vendorSpecificParameters'} + {'$ref': '#/components/parameters/vendorSpecificParameters'}, + {'$ref': '#/components/parameters/facets'}, ], 'responses': { '200': {