diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index 8c913e0a..54ab026b 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -171,17 +171,6 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): logger.info('Ensuring character coding is UTF8') f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False) try: - save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter} - try: - with UnknownEncodingStream(csv_filepath, file_format, decoding_result, - skip_rows=skip_rows) as stream: - stream.save(**save_args) - except (EncodingError, UnicodeDecodeError): - with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING, - skip_rows=skip_rows) as stream: - stream.save(**save_args) - csv_filepath = f_write.name - # datastore db connection engine = get_write_engine() @@ -189,10 +178,16 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): existing = datastore_resource_exists(resource_id) existing_info = {} if existing: - existing_fields = existing.get('fields', []) + if p.toolkit.check_ckan_version(min_version='2.10'): + ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id}) + existing_fields = ds_info.get('fields', []) + else: + existing_fields = existing.get('fields', []) existing_info = dict((f['id'], f['info']) for f in existing_fields if 'info' in f) + existing_fields_by_headers = dict((f['id'], f) + for f in existing_fields) # Column types are either set (overridden) in the Data Dictionary page # or default to text type (which is robust) @@ -207,6 +202,8 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): for f in fields: if f['id'] in existing_info: f['info'] = existing_info[f['id']] + f['strip_extra_white'] = existing_info[f['id']].get('strip_extra_white') if 'strip_extra_white' in existing_info[f['id']] \ + else existing_fields_by_headers[f['id']].get('strip_extra_white', True) ''' Delete or truncate existing datastore table before proceeding, @@ -223,11 +220,43 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): else: fields = [ {'id': header_name, - 'type': 'text'} + 'type': 'text', + 'strip_extra_white': True,} for header_name in headers] logger.info('Fields: %s', fields) + save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter} + try: + with UnknownEncodingStream(csv_filepath, file_format, decoding_result, + skip_rows=skip_rows) as stream: + super_iter = stream.iter + def strip_white_space_iter(): + for row in super_iter(): + if len(row) == len(fields): + for _index, _cell in enumerate(row): + # only strip white space if strip_extra_white is True + if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str): + row[_index] = _cell.strip() + yield row + stream.iter = strip_white_space_iter + stream.save(**save_args) + except (EncodingError, UnicodeDecodeError): + with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING, + skip_rows=skip_rows) as stream: + super_iter = stream.iter + def strip_white_space_iter(): + for row in super_iter(): + if len(row) == len(fields): + for _index, _cell in enumerate(row): + # only strip white space if strip_extra_white is True + if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str): + row[_index] = _cell.strip() + yield row + stream.iter = strip_white_space_iter + stream.save(**save_args) + csv_filepath = 
f_write.name + # Create table from ckan import model context = {'model': model, 'ignore_auth': True} @@ -371,10 +400,13 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): existing = datastore_resource_exists(resource_id) existing_info = None if existing: - existing_fields = existing.get('fields', []) + ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id}) + existing_fields = ds_info.get('fields', []) existing_info = dict( (f['id'], f['info']) for f in existing_fields if 'info' in f) + existing_fields_by_headers = dict((f['id'], f) + for f in existing_fields) # Some headers might have been converted from strings to floats and such. headers = encode_headers(headers) @@ -388,6 +420,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): strict_guessing = p.toolkit.asbool( config.get('ckanext.xloader.strict_type_guessing', True)) types = type_guess(stream.sample[1:], types=TYPES, strict=strict_guessing) + fields = [] # override with types user requested if existing_info: @@ -398,9 +431,15 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): 'timestamp': datetime.datetime, }.get(existing_info.get(h, {}).get('type_override'), t) for t, h in zip(types, headers)] + for h in headers: + fields.append(existing_fields_by_headers.get(h, {})) + else: + # default strip_extra_white + for h in headers: + fields.append({'strip_extra_white': True}) headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()] - type_converter = TypeConverter(types=types) + type_converter = TypeConverter(types=types, fields=fields) with UnknownEncodingStream(table_filepath, file_format, decoding_result, skip_rows=skip_rows, @@ -421,10 +460,16 @@ def row_iterator(): for h in headers_dicts: if h['id'] in existing_info: h['info'] = existing_info[h['id']] + h['strip_extra_white'] = existing_info[h['id']].get('strip_extra_white') if 'strip_extra_white' in existing_info[h['id']] \ + else existing_fields_by_headers[h['id']].get('strip_extra_white', True) # create columns with types user requested type_override = existing_info[h['id']].get('type_override') if type_override in list(_TYPE_MAPPING.values()): h['type'] = type_override + else: + # default strip_extra_white + for h in headers_dicts: + h['strip_extra_white'] = True logger.info('Determined headers and types: %s', headers_dicts) diff --git a/ckanext/xloader/parser.py b/ckanext/xloader/parser.py index 11e756cd..26193203 100644 --- a/ckanext/xloader/parser.py +++ b/ckanext/xloader/parser.py @@ -18,8 +18,9 @@ class TypeConverter: as desired. """ - def __init__(self, types=None): + def __init__(self, types=None, fields=None): self.types = types + self.fields = fields def convert_types(self, extended_rows): """ Try converting cells to numbers or timestamps if applicable. 
@@ -31,7 +32,16 @@ def convert_types(self, extended_rows): for cell_index, cell_value in enumerate(row): if cell_value is None: row[cell_index] = '' + if self.fields: + # only strip whitespace if the field's strip_extra_white flag is True + if self.fields[cell_index].get('strip_extra_white', True) and isinstance(cell_value, six.text_type): + cell_value = cell_value.strip() + row[cell_index] = cell_value if not cell_value: + # parity with load_csv: an empty cell of string type becomes None + if self.types and self.types[cell_index] == six.text_type: + cell_value = None + row[cell_index] = None continue cell_type = self.types[cell_index] if self.types else None if cell_type in [Decimal, None]: diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py index 05d629c8..e8268776 100644 --- a/ckanext/xloader/plugin.py +++ b/ckanext/xloader/plugin.py @@ -20,6 +20,12 @@ def config_declarations(cls): return cls +if toolkit.check_ckan_version(min_version='2.11'): + from ckanext.datastore.interfaces import IDataDictionaryForm + has_idata_dictionary_form = True +else: + has_idata_dictionary_form = False + log = logging.getLogger(__name__) @@ -34,6 +40,8 @@ class xloaderPlugin(plugins.SingletonPlugin): plugins.implements(plugins.IResourceController, inherit=True) plugins.implements(plugins.IClick) plugins.implements(plugins.IBlueprint) + if has_idata_dictionary_form: + plugins.implements(IDataDictionaryForm, inherit=True) # IClick def get_commands(self): @@ -208,6 +216,23 @@ def get_helpers(self): "is_resource_supported_by_xloader": xloader_helpers.is_resource_supported_by_xloader, } + # IDataDictionaryForm + + def update_datastore_create_schema(self, schema): + default = toolkit.get_validator('default') + boolean_validator = toolkit.get_validator('boolean_validator') + to_datastore_plugin_data = toolkit.get_validator('to_datastore_plugin_data') + schema['fields']['strip_extra_white'] = [default(True), boolean_validator, to_datastore_plugin_data('xloader')] + return schema + + def update_datastore_info_field(self, field, plugin_data): + # expose all our non-secret plugin data in the field + field.update(plugin_data.get('xloader', {})) + # CKAN version parity: expose the legacy 'info' dict when it is stored as plugin data + if '_info' in plugin_data: + field.update({'info': plugin_data['_info']}) + return field + def _should_remove_unsupported_resource_from_datastore(res_dict): if not toolkit.asbool(toolkit.config.get('ckanext.xloader.clean_datastore_tables', False)): diff --git a/ckanext/xloader/templates/datastore/snippets/dictionary_form.html b/ckanext/xloader/templates/datastore/snippets/dictionary_form.html new file mode 100644 index 00000000..808aa764 --- /dev/null +++ b/ckanext/xloader/templates/datastore/snippets/dictionary_form.html @@ -0,0 +1,17 @@ +{% ckan_extends %} +{% import 'macros/form.html' as form %} + +{% block additional_fields %} + {{ super() }} + {% if h.check_ckan_version(min_version='2.11') %} + {% set field_prefix = 'fields__' %} + {% else %} + {% set field_prefix = 'info__' %} + {% endif %} + {% set is_selected = field.get('info', {}).get('strip_extra_white', field.get('strip_extra_white')) != 'False' %} + {{ form.select(field_prefix ~ position ~ '__strip_extra_white', label=_('Strip Extra Leading and Trailing White Space'), options=[ + {'text': _('Yes'), 'value': true}, + {'text': _('No'), 'value': false}, + ], selected=is_selected) }} +{% endblock %}
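Taken together, the plugin hook and template above make `strip_extra_white` a first-class Data Dictionary attribute. For anyone driving this from the API side, here is a minimal sketch (assuming CKAN 2.11, a placeholder `resource_id`, and the same `datastore_info`/`datastore_create` actions this patch already uses) of opting one column out of stripping:

```python
import ckan.plugins.toolkit as toolkit

# Placeholder resource id; the tests below get theirs from factories.Resource().
resource_id = 'my-resource-id'

# Read the current field definitions, skipping internal columns.
info = toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id})
fields = [f for f in info['fields'] if not f['id'].startswith('_')]

# Opt a single column out of whitespace stripping.
for f in fields:
    if f['id'] == 'Department':
        f['strip_extra_white'] = False

# Re-create the field metadata; the next xloader run honours the flag
# via existing_fields_by_headers in loader.py.
toolkit.get_action('datastore_create')(
    {'ignore_auth': True},
    {'resource_id': resource_id, 'force': True, 'fields': fields},
)
```
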
diff --git a/ckanext/xloader/tests/samples/boston_311_sample.csv b/ckanext/xloader/tests/samples/boston_311_sample.csv index 83e0d5f2..e3a7e5be 100644 --- a/ckanext/xloader/tests/samples/boston_311_sample.csv +++ b/ckanext/xloader/tests/samples/boston_311_sample.csv @@ -1,4 +1,4 @@ -CASE_ENQUIRY_ID,open_dt,target_dt,closed_dt,OnTime_Status,CASE_STATUS,CLOSURE_REASON,CASE_TITLE,SUBJECT,REASON,TYPE,QUEUE,Department,SubmittedPhoto,ClosedPhoto,Location,Fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,LOCATION_STREET_NAME,LOCATION_ZIPCODE,Latitude,Longitude,Source -101002153891,2017-07-06 23:38:43,2017-07-21 08:30:00,,ONTIME,Open, ,Street Light Outages,Public Works Department,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,,,480 Harvard St Dorchester MA 02124,8,07,4,B3,Greater Mattapan,9,Ward 14,1411,480 Harvard St,02124,42.288,-71.0927,Citizens Connect App -101002153890,2017-07-06 23:29:13,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg,,522 Saratoga St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0110,522 Saratoga St,02128,42.3807,-71.0259,Citizens Connect App -101002153889,2017-07-06 23:24:20,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg,,965 Bennington St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0112,965 Bennington St,02128,42.386,-71.008,Citizens Connect App +CASE_ENQUIRY_ID,open_dt,target_dt,closed_dt,OnTime_Status,CASE_STATUS,CLOSURE_REASON,CASE_TITLE,SUBJECT,REASON,TYPE,QUEUE,Department,SubmittedPhoto,ClosedPhoto,Location,Fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,LOCATION_STREET_NAME,LOCATION_ZIPCODE,Latitude,Longitude,Source +101002153891,2017-07-06 23:38:43,2017-07-21 08:30:00,,ONTIME,Open, ,Street Light Outages,Public Works Department ,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,,,480 Harvard St Dorchester MA 02124,8,07,4,B3,Greater Mattapan,9,Ward 14,1411,480 Harvard St,02124,42.288,-71.0927,Citizens Connect App +101002153890,2017-07-06 23:29:13,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg,,522 Saratoga St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0110,522 Saratoga St,02128,42.3807,-71.0259,Citizens Connect App +101002153889,2017-07-06 23:24:20,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg,,965 Bennington St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0112,965 Bennington St,02128,42.386,-71.008,Citizens Connect App diff --git a/ckanext/xloader/tests/test_jobs.py b/ckanext/xloader/tests/test_jobs.py index e819dad9..62ae7174 100644 --- a/ckanext/xloader/tests/test_jobs.py +++ b/ckanext/xloader/tests/test_jobs.py @@ -81,7 +81,7 @@ def test_xloader_data_into_datastore(self, cli, data): with mock.patch("ckanext.xloader.jobs.get_response", get_response): stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output assert "File hash: d44fa65eda3675e11710682fdb5f1648" in stdout - assert "Fields: [{'id': 
'x', 'type': 'text'}, {'id': 'y', 'type': 'text'}]" in stdout + assert "Fields: [{'id': 'x', 'type': 'text', 'strip_extra_white': True}, {'id': 'y', 'type': 'text', 'strip_extra_white': True}]" in stdout assert "Copying to database..." in stdout assert "Creating search index..." in stdout assert "Express Load completed" in stdout diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py index e8816a13..ba1b9288 100644 --- a/ckanext/xloader/tests/test_loader.py +++ b/ckanext/xloader/tests/test_loader.py @@ -102,6 +102,20 @@ def test_simple(self, Session): logger=logger, ) + records = self._get_records(Session, resource_id) + print(self._get_column_names(Session, resource_id)) + assert self._get_column_names(Session, resource_id) == [ + u"_id", + u"_full_text", + u"date", + u"temperature", + u"place", + ] + print(self._get_column_types(Session, resource_id)) + assert self._get_column_types(Session, resource_id) == [ + u"int4", + u"tsvector", + ] + [u"text"] * (len(records[0]) - 1) assert self._get_records( Session, resource_id, limit=1, exclude_full_text_column=False ) == [ @@ -113,7 +127,8 @@ def test_simple(self, Session): u"Galway", ) ] - assert self._get_records(Session, resource_id) == [ + print(records) + assert records == [ (1, u"2011-01-01", u"1", u"Galway"), (2, u"2011-01-02", u"-1", u"Galway"), (3, u"2011-01-03", u"0", u"Galway"), @@ -121,20 +136,6 @@ def test_simple(self, Session): (5, None, None, u"Berkeley"), (6, u"2011-01-03", u"5", None), ] - assert self._get_column_names(Session, resource_id) == [ - u"_id", - u"_full_text", - u"date", - u"temperature", - u"place", - ] - assert self._get_column_types(Session, resource_id) == [ - u"int4", - u"tsvector", - u"text", - u"text", - u"text", - ] def test_simple_with_indexing(self, Session): csv_filepath = get_sample_filepath("simple.csv") @@ -217,6 +218,45 @@ def test_boston_311(self, Session): ) records = self._get_records(Session, resource_id) + print(self._get_column_names(Session, resource_id)) + assert self._get_column_names(Session, resource_id) == [ + u"_id", + u"_full_text", + u"CASE_ENQUIRY_ID", + u"open_dt", + u"target_dt", + u"closed_dt", + u"OnTime_Status", + u"CASE_STATUS", + u"CLOSURE_REASON", + u"CASE_TITLE", + u"SUBJECT", + u"REASON", + u"TYPE", + u"QUEUE", + u"Department", + u"SubmittedPhoto", + u"ClosedPhoto", + u"Location", + u"Fire_district", + u"pwd_district", + u"city_council_district", + u"police_district", + u"neighborhood", + u"neighborhood_services_district", + u"ward", + u"precinct", + u"LOCATION_STREET_NAME", + u"LOCATION_ZIPCODE", + u"Latitude", + u"Longitude", + u"Source", + ] # noqa + print(self._get_column_types(Session, resource_id)) + assert self._get_column_types(Session, resource_id) == [ + u"int4", + u"tsvector", + ] + [u"text"] * (len(records[0]) - 1) print(records) assert records == [ ( @@ -227,9 +267,9 @@ def test_boston_311(self, Session): None, u"ONTIME", u"Open", - u" ", + None, # " " transforms to None u"Street Light Outages", - u"Public Works Department", + u"Public Works Department", # " " trailing whitespace gets trimmed u"Street Lights", u"Street Light Outages", u"PWDx_Street Light Outages", @@ -259,14 +299,14 @@ def test_boston_311(self, Session): None, u"ONTIME", u"Open", - u" ", + None, # " " transforms to None u"Graffiti Removal", u"Property Management", u"Graffiti", u"Graffiti Removal", u"PROP_GRAF_GraffitiRemoval", u"PROP", - u" https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", + 
u"https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # strip white spaces None, u"522 Saratoga St East Boston MA 02128", u"1", @@ -291,14 +331,14 @@ def test_boston_311(self, Session): None, u"ONTIME", u"Open", - u" ", + None, # " " transforms to None u"Graffiti Removal", u"Property Management", u"Graffiti", u"Graffiti Removal", u"PROP_GRAF_GraffitiRemoval", u"PROP", - u" https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", + u"https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", # strip white spaces None, u"965 Bennington St East Boston MA 02128", u"1", @@ -316,45 +356,6 @@ def test_boston_311(self, Session): u"Citizens Connect App", ), ] # noqa - print(self._get_column_names(Session, resource_id)) - assert self._get_column_names(Session, resource_id) == [ - u"_id", - u"_full_text", - u"CASE_ENQUIRY_ID", - u"open_dt", - u"target_dt", - u"closed_dt", - u"OnTime_Status", - u"CASE_STATUS", - u"CLOSURE_REASON", - u"CASE_TITLE", - u"SUBJECT", - u"REASON", - u"TYPE", - u"QUEUE", - u"Department", - u"SubmittedPhoto", - u"ClosedPhoto", - u"Location", - u"Fire_district", - u"pwd_district", - u"city_council_district", - u"police_district", - u"neighborhood", - u"neighborhood_services_district", - u"ward", - u"precinct", - u"LOCATION_STREET_NAME", - u"LOCATION_ZIPCODE", - u"Latitude", - u"Longitude", - u"Source", - ] # noqa - print(self._get_column_types(Session, resource_id)) - assert self._get_column_types(Session, resource_id) == [ - u"int4", - u"tsvector", - ] + [u"text"] * (len(records[0]) - 1) def test_brazilian(self, Session): csv_filepath = get_sample_filepath("brazilian_sample.csv") @@ -368,105 +369,6 @@ def test_brazilian(self, Session): ) records = self._get_records(Session, resource_id) - print(records) - assert records[0] == ( - 1, - u"01/01/1996 12:00:00 AM", - u"1100015", - u"ALTA FLORESTA D'OESTE", - u"RO", - None, - u"128", - u"0", - u"8", - u"119", - u"1", - u"0", - u"3613", - u"3051", - u"130", - u"7", - u"121", - u"3716", - u"3078", - u"127", - u"7", - None, - None, - None, - None, - u"6794", - u"5036", - u"1758", - None, - None, - None, - None, - None, - None, - u"337", - u"0.26112759", - u"0.17210683", - u"0.43323442", - u"0.13353115", - u"24.833692447908199", - None, - None, - u"22.704964", - u"67.080006197818605", - u"65.144188573097907", - u"74.672390253375497", - u"16.7913561569619", - u"19.4894563570641", - u"8.649237411458509", - u"7.60165422117368", - u"11.1540090366186", - u"17.263407056738099", - u"8.5269823", - u"9.2213373", - u"5.3085136", - u"52.472769803217503", - None, - None, - None, - None, - None, - None, - u"25.0011414302354", - u"22.830887000000001", - u"66.8150490097632", - u"64.893674212235595", - u"74.288246611754104", - u"17.0725384713319", - u"19.8404105332814", - u"8.856561911292371", - u"7.74275834336647", - u"11.357671741889", - u"17.9410577459881", - u"8.3696527", - u"8.9979973", - u"5.0570836", - u"53.286314230720798", - None, - None, - None, - None, - None, - u"122988", - None, - u"10.155015000000001", - u"14.826086999999999", - u"11.671533", - u"9.072917", - None, - None, - None, - None, - None, - None, - None, - None, - ) # noqa print(self._get_column_names(Session, resource_id)) assert self._get_column_names(Session, resource_id) == [ u"_id", @@ -572,6 +474,105 @@ def test_brazilian(self, Session): u"int4", u"tsvector", ] + [u"text"] * (len(records[0]) - 1) + print(records) + assert records[0] == ( + 1, + 
u"01/01/1996 12:00:00 AM", + u"1100015", + u"ALTA FLORESTA D'OESTE", + u"RO", + None, + u"128", + u"0", + u"8", + u"119", + u"1", + u"0", + u"3613", + u"3051", + u"130", + u"7", + u"121", + u"3716", + u"3078", + u"127", + u"7", + None, + None, + None, + None, + u"6794", + u"5036", + u"1758", + None, + None, + None, + None, + None, + None, + u"337", + u"0.26112759", + u"0.17210683", + u"0.43323442", + u"0.13353115", + u"24.833692447908199", + None, + None, + u"22.704964", + u"67.080006197818605", + u"65.144188573097907", + u"74.672390253375497", + u"16.7913561569619", + u"19.4894563570641", + u"8.649237411458509", + u"7.60165422117368", + u"11.1540090366186", + u"17.263407056738099", + u"8.5269823", + u"9.2213373", + u"5.3085136", + u"52.472769803217503", + None, + None, + None, + None, + None, + None, + u"25.0011414302354", + u"22.830887000000001", + u"66.8150490097632", + u"64.893674212235595", + u"74.288246611754104", + u"17.0725384713319", + u"19.8404105332814", + u"8.856561911292371", + u"7.74275834336647", + u"11.357671741889", + u"17.9410577459881", + u"8.3696527", + u"8.9979973", + u"5.0570836", + u"53.286314230720798", + None, + None, + None, + None, + None, + u"122988", + None, + u"10.155015000000001", + u"14.826086999999999", + u"11.671533", + u"9.072917", + None, + None, + None, + None, + None, + None, + None, + None, + ) # noqa def test_german(self, Session): csv_filepath = get_sample_filepath("german_sample.csv") @@ -585,20 +586,6 @@ def test_german(self, Session): ) records = self._get_records(Session, resource_id) - print(records) - assert records[0] == ( - 1, - u"Zürich", - u"68260", - u"65444", - u"62646", - u"6503", - u"28800", - u"1173", - u"6891", - u"24221", - u"672", - ) print(self._get_column_names(Session, resource_id)) assert self._get_column_names(Session, resource_id) == [ u"_id", @@ -619,6 +606,20 @@ def test_german(self, Session): u"int4", u"tsvector", ] + [u"text"] * (len(records[0]) - 1) + print(records) + assert records[0] == ( + 1, + u"Zürich", + u"68260", + u"65444", + u"62646", + u"6503", + u"28800", + u"1173", + u"6891", + u"24221", + u"672", + ) def test_with_blanks(self, Session): csv_filepath = get_sample_filepath("sample_with_blanks.csv") @@ -699,7 +700,6 @@ def test_reload(self, Session): logger=logger, ) - assert len(self._get_records(Session, resource_id)) == 6 assert self._get_column_names(Session, resource_id) == [ u"_id", u"_full_text", @@ -714,6 +714,7 @@ def test_reload(self, Session): u"text", u"text", ] + assert len(self._get_records(Session, resource_id)) == 6 @pytest.mark.skipif( not p.toolkit.check_ckan_version(min_version="2.7"), @@ -752,7 +753,6 @@ def test_reload_with_overridden_types(self, Session): fields=fields, resource_id=resource_id, logger=logger ) - assert len(self._get_records(Session, resource_id)) == 6 assert self._get_column_names(Session, resource_id) == [ u"_id", u"_full_text", @@ -767,6 +767,7 @@ def test_reload_with_overridden_types(self, Session): u"numeric", u"text", ] + assert len(self._get_records(Session, resource_id)) == 6 # check that rows with nulls are indexed correctly records = self._get_records( @@ -816,6 +817,181 @@ def test_column_names(self, Session): u"Galway", ) + def test_load_with_no_strip_white(self, Session): + csv_filepath = get_sample_filepath("boston_311_sample.csv") + resource = factories.Resource() + resource_id = resource['id'] + loader.load_csv( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=logger, + ) + + # Change strip_extra_white, as it would be done by Data 
Dictionary + rec = p.toolkit.get_action("datastore_search")( + None, {"resource_id": resource_id, "limit": 0} + ) + fields = [f for f in rec["fields"] if not f["id"].startswith("_")] + for field in fields: + field["info"] = {"strip_extra_white": False} # <=2.10 + field["strip_extra_white"] = False # >=2.11 + p.toolkit.get_action("datastore_create")( + {"ignore_auth": True}, + {"resource_id": resource_id, "force": True, "fields": fields}, + ) + + # Load it again with new strip_extra_white + fields = loader.load_csv( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=logger, + ) + loader.create_column_indexes( + fields=fields, resource_id=resource_id, logger=logger + ) + + records = self._get_records(Session, resource_id) + print(self._get_column_names(Session, resource_id)) + assert self._get_column_names(Session, resource_id) == [ + u"_id", + u"_full_text", + u"CASE_ENQUIRY_ID", + u"open_dt", + u"target_dt", + u"closed_dt", + u"OnTime_Status", + u"CASE_STATUS", + u"CLOSURE_REASON", + u"CASE_TITLE", + u"SUBJECT", + u"REASON", + u"TYPE", + u"QUEUE", + u"Department", + u"SubmittedPhoto", + u"ClosedPhoto", + u"Location", + u"Fire_district", + u"pwd_district", + u"city_council_district", + u"police_district", + u"neighborhood", + u"neighborhood_services_district", + u"ward", + u"precinct", + u"LOCATION_STREET_NAME", + u"LOCATION_ZIPCODE", + u"Latitude", + u"Longitude", + u"Source", + ] # noqa + print(self._get_column_types(Session, resource_id)) + assert self._get_column_types(Session, resource_id) == [ + u"int4", + u"tsvector", + ] + [u"text"] * (len(records[0]) - 1) + print(records) + assert records == [ + ( + 4, # ds auto increment + u"101002153891", + u"2017-07-06 23:38:43", + u"2017-07-21 08:30:00", + None, + u"ONTIME", + u"Open", + u" ", # no strip_extra_white + u"Street Light Outages", + u"Public Works Department ", # no strip_extra_white + u"Street Lights", + u"Street Light Outages", + u"PWDx_Street Light Outages", + u"PWDx", + None, + None, + u"480 Harvard St Dorchester MA 02124", + u"8", + u"07", + u"4", + u"B3", + u"Greater Mattapan", + u"9", + u"Ward 14", + u"1411", + u"480 Harvard St", + u"02124", + u"42.288", + u"-71.0927", + u"Citizens Connect App", + ), # noqa + ( + 5, # ds auto increment + u"101002153890", + u"2017-07-06 23:29:13", + u"2017-09-11 08:30:00", + None, + u"ONTIME", + u"Open", + u" ", # no strip_extra_white + u"Graffiti Removal", + u"Property Management", + u"Graffiti", + u"Graffiti Removal", + u"PROP_GRAF_GraffitiRemoval", + u"PROP", + u" https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # no strip_extra_white + None, + u"522 Saratoga St East Boston MA 02128", + u"1", + u"09", + u"1", + u"A7", + u"East Boston", + u"1", + u"Ward 1", + u"0110", + u"522 Saratoga St", + u"02128", + u"42.3807", + u"-71.0259", + u"Citizens Connect App", + ), # noqa + ( + 6, # ds auto increment + u"101002153889", + u"2017-07-06 23:24:20", + u"2017-09-11 08:30:00", + None, + u"ONTIME", + u"Open", + u" ", # no strip_extra_white + u"Graffiti Removal", + u"Property Management", + u"Graffiti", + u"Graffiti Removal", + u"PROP_GRAF_GraffitiRemoval", + u"PROP", + u" https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", # no strip_extra_white + None, + u"965 Bennington St East Boston MA 02128", + u"1", + u"09", + u"1", + u"A7", + u"East Boston", + u"1", + u"Ward 1", + u"0112", + u"965 Bennington St", + u"02128", + u"42.386", + u"-71.008", + u"Citizens Connect App", + ), + ] # noqa 
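The fixtures above pin down both the default behaviour (strip) and the per-field opt-out. The row-level mechanics that `load_csv` wires into the tabulator stream reduce to a small generator; this standalone sketch mirrors `strip_white_space_iter` outside of xloader:

```python
def strip_rows(rows, fields):
    """Yield rows with text cells stripped when the matching field's
    strip_extra_white flag is set (it defaults to True)."""
    for row in rows:
        # Only rows that line up with the header definitions are touched,
        # mirroring the len(row) == len(fields) guard in load_csv.
        if len(row) == len(fields):
            for i, cell in enumerate(row):
                if fields[i].get('strip_extra_white', True) and isinstance(cell, str):
                    row[i] = cell.strip()
        yield row

fields = [{'id': 'Department', 'strip_extra_white': True}]
rows = [['Public Works Department ']]
assert list(strip_rows(rows, fields)) == [['Public Works Department']]
```
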
+ class TestLoadUnhandledTypes(TestLoadBase): def test_kml(self): @@ -919,6 +1095,20 @@ def test_simple(self, Session): # "'-01':4,5 '00':6,7,8 '1':1 '2011':3 'galway':2" # "'-01':2,3 '00':5,6 '1':7 '2011':1 'galway':8 't00':4" + assert self._get_column_names(Session, resource_id) == [ + u"_id", + u"_full_text", + u"date", + u"temperature", + u"place", + ] + assert self._get_column_types(Session, resource_id) == [ + u"int4", + u"tsvector", + u"timestamp", + u"numeric", + u"text", + ] assert self._get_records(Session, resource_id) == [ (1, datetime.datetime(2011, 1, 1, 0, 0), Decimal("1"), u"Galway",), ( @@ -947,20 +1137,6 @@ def test_simple(self, Session): u"Berkeley", ), ] - assert self._get_column_names(Session, resource_id) == [ - u"_id", - u"_full_text", - u"date", - u"temperature", - u"place", - ] - assert self._get_column_types(Session, resource_id) == [ - u"int4", - u"tsvector", - u"timestamp", - u"numeric", - u"text", - ] def test_simple_large_file(self, Session): csv_filepath = get_sample_filepath("simple-large.csv") @@ -1060,6 +1236,74 @@ def test_boston_311(self, Session): ) records = self._get_records(Session, resource_id) + print(self._get_column_names(Session, resource_id)) + assert self._get_column_names(Session, resource_id) == [ + u"_id", # int4 + u"_full_text", # tsvector + u"CASE_ENQUIRY_ID", # numeric + u"open_dt", # timestamp + u"target_dt", # timestamp + u"closed_dt", # text + u"OnTime_Status", # text + u"CASE_STATUS", # text + u"CLOSURE_REASON", # text + u"CASE_TITLE", # text + u"SUBJECT", # text + u"REASON", # text + u"TYPE", # text + u"QUEUE", # text + u"Department", # text + u"SubmittedPhoto", # text + u"ClosedPhoto", # text + u"Location", # text + u"Fire_district", # numeric + u"pwd_district", # numeric + u"city_council_district", # numeric + u"police_district", # text + u"neighborhood", # text + u"neighborhood_services_district", # numeric + u"ward", # text + u"precinct", # numeric + u"LOCATION_STREET_NAME", # text + u"LOCATION_ZIPCODE", # numeric + u"Latitude", # numeric + u"Longitude", # numeric + u"Source", # text + ] # noqa + print(self._get_column_types(Session, resource_id)) + assert self._get_column_types(Session, resource_id) == [ + u"int4", # _id + u"tsvector", # _full_text + u"numeric", # CASE_ENQUIRY_ID + u"timestamp", # open_dt + u"timestamp", # target_dt + u"text", # closed_dt + u"text", # OnTime_Status + u"text", # CASE_STATUS + u"text", # CLOSURE_REASON + u"text", # CASE_TITLE + u"text", # SUBJECT + u"text", # REASON + u"text", # TYPE + u"text", # QUEUE + u"text", # Department + u"text", # SubmittedPhoto + u"text", # ClosedPhoto + u"text", # Location + u"numeric", # Fire_district + u"numeric", # pwd_district + u"numeric", # city_council_district + u"text", # police_district + u"text", # neighborhood + u"numeric", # neighborhood_services_district + u"text", # ward + u"numeric", # precinct + u"text", # LOCATION_STREET_NAME + u"numeric", # LOCATION_ZIPCODE + u"numeric", # Latitude + u"numeric", # Longitude + u"text", # Source + ] # noqa print(records) assert records == [ ( @@ -1067,18 +1311,18 @@ def test_boston_311(self, Session): Decimal("101002153891"), datetime.datetime(2017, 7, 6, 23, 38, 43), datetime.datetime(2017, 7, 21, 8, 30), - u"", + None, u"ONTIME", u"Open", - u" ", + None, # " " transforms to None u"Street Light Outages", - u"Public Works Department", + u"Public Works Department", # " " trailing whitespace gets trimmed u"Street Lights", u"Street Light Outages", u"PWDx_Street Light Outages", u"PWDx", - u"", - u"", + None, + None, u"480 
Harvard St Dorchester MA 02124", Decimal("8"), Decimal("7"), @@ -1099,18 +1343,18 @@ def test_boston_311(self, Session): Decimal("101002153890"), datetime.datetime(2017, 7, 6, 23, 29, 13), datetime.datetime(2017, 9, 11, 8, 30), - u"", + None, u"ONTIME", u"Open", - u" ", + None, # " " transforms to None u"Graffiti Removal", u"Property Management", u"Graffiti", u"Graffiti Removal", u"PROP_GRAF_GraffitiRemoval", u"PROP", - u" https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", - u"", + u"https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # strip white spaces + None, u"522 Saratoga St East Boston MA 02128", Decimal("1"), Decimal("9"), @@ -1131,18 +1375,18 @@ def test_boston_311(self, Session): Decimal("101002153889"), datetime.datetime(2017, 7, 6, 23, 24, 20), datetime.datetime(2017, 9, 11, 8, 30), - u"", + None, u"ONTIME", u"Open", - u" ", + None, # " " transforms to None u"Graffiti Removal", u"Property Management", u"Graffiti", u"Graffiti Removal", u"PROP_GRAF_GraffitiRemoval", u"PROP", - u" https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", - u"", + u"https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", # strip white spaces + None, u"965 Bennington St East Boston MA 02128", Decimal("1"), Decimal("9"), @@ -1159,74 +1403,6 @@ def test_boston_311(self, Session): u"Citizens Connect App", ), ] # noqa - print(self._get_column_names(Session, resource_id)) - assert self._get_column_names(Session, resource_id) == [ - u"_id", - u"_full_text", - u"CASE_ENQUIRY_ID", - u"open_dt", - u"target_dt", - u"closed_dt", - u"OnTime_Status", - u"CASE_STATUS", - u"CLOSURE_REASON", - u"CASE_TITLE", - u"SUBJECT", - u"REASON", - u"TYPE", - u"QUEUE", - u"Department", - u"SubmittedPhoto", - u"ClosedPhoto", - u"Location", - u"Fire_district", - u"pwd_district", - u"city_council_district", - u"police_district", - u"neighborhood", - u"neighborhood_services_district", - u"ward", - u"precinct", - u"LOCATION_STREET_NAME", - u"LOCATION_ZIPCODE", - u"Latitude", - u"Longitude", - u"Source", - ] # noqa - print(self._get_column_types(Session, resource_id)) - assert self._get_column_types(Session, resource_id) == [ - u"int4", - u"tsvector", - u"numeric", - u"timestamp", - u"timestamp", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"numeric", - u"numeric", - u"numeric", - u"text", - u"text", - u"numeric", - u"text", - u"numeric", - u"text", - u"numeric", - u"numeric", - u"numeric", - u"text", - ] # noqa def test_no_entries(self): csv_filepath = get_sample_filepath("no_entries.csv") @@ -1298,3 +1474,207 @@ def test_preserving_time_ranges(self, Session): (3, "Barcaldine", 4725, Decimal("-23.55327901"), Decimal("145.289156"), "9:00-12:30", "13:30-16:30", datetime.datetime(2018, 7, 20)) ] + + def test_load_with_no_strip_white(self, Session): + csv_filepath = get_sample_filepath("boston_311_sample.csv") + resource = factories.Resource() + resource_id = resource['id'] + loader.load_table( + csv_filepath, + resource_id=resource_id, + mimetype="csv", + logger=logger, + ) + + # Change strip_extra_white, as it would be done by Data Dictionary + rec = p.toolkit.get_action("datastore_search")( + None, {"resource_id": resource_id, "limit": 0} + ) + fields = [f for f in rec["fields"] if not f["id"].startswith("_")] + for field in fields: + field["info"] = 
{"strip_extra_white": False} # <=2.10 + field["strip_extra_white"] = False # >=2.11 + p.toolkit.get_action("datastore_create")( + {"ignore_auth": True}, + {"resource_id": resource_id, "force": True, "fields": fields}, + ) + + # Load it again with new strip_extra_white + fields = loader.load_table( + csv_filepath, + resource_id=resource_id, + mimetype="csv", + logger=logger, + ) + loader.create_column_indexes( + fields=fields, resource_id=resource_id, logger=logger + ) + + records = self._get_records(Session, resource_id) + print(self._get_column_names(Session, resource_id)) + assert self._get_column_names(Session, resource_id) == [ + u"_id", # int4 + u"_full_text", # tsvector + u"CASE_ENQUIRY_ID", # numeric + u"open_dt", # timestamp + u"target_dt", # timestamp + u"closed_dt", # text + u"OnTime_Status", # text + u"CASE_STATUS", # text + u"CLOSURE_REASON", # text + u"CASE_TITLE", # text + u"SUBJECT", # text + u"REASON", # text + u"TYPE", # text + u"QUEUE", # text + u"Department", # text + u"SubmittedPhoto", # text + u"ClosedPhoto", # text + u"Location", # text + u"Fire_district", # numeric + u"pwd_district", # numeric + u"city_council_district", # numeric + u"police_district", # text + u"neighborhood", # text + u"neighborhood_services_district", # numeric + u"ward", # text + u"precinct", # numeric + u"LOCATION_STREET_NAME", # text + u"LOCATION_ZIPCODE", # numeric + u"Latitude", # numeric + u"Longitude", # numeric + u"Source", # text + ] # noqa + print(self._get_column_types(Session, resource_id)) + assert self._get_column_types(Session, resource_id) == [ + u"int4", # _id + u"tsvector", # _full_text + u"numeric", # CASE_ENQUIRY_ID + u"timestamp", # open_dt + u"timestamp", # target_dt + u"text", # closed_dt + u"text", # OnTime_Status + u"text", # CASE_STATUS + u"text", # CLOSURE_REASON + u"text", # CASE_TITLE + u"text", # SUBJECT + u"text", # REASON + u"text", # TYPE + u"text", # QUEUE + u"text", # Department + u"text", # SubmittedPhoto + u"text", # ClosedPhoto + u"text", # Location + u"numeric", # Fire_district + u"numeric", # pwd_district + u"numeric", # city_council_district + u"text", # police_district + u"text", # neighborhood + u"numeric", # neighborhood_services_district + u"text", # ward + u"numeric", # precinct + u"text", # LOCATION_STREET_NAME + u"numeric", # LOCATION_ZIPCODE + u"numeric", # Latitude + u"numeric", # Longitude + u"text", # Source + ] # noqa + print(records) + assert records == [ + ( + 4, # ds auto increment + Decimal("101002153891"), + datetime.datetime(2017, 7, 6, 23, 38, 43), + datetime.datetime(2017, 7, 21, 8, 30), + None, + u"ONTIME", + u"Open", + u" ", # no strip_extra_white + u"Street Light Outages", + u"Public Works Department ", # no strip_extra_white + u"Street Lights", + u"Street Light Outages", + u"PWDx_Street Light Outages", + u"PWDx", + None, + None, + u"480 Harvard St Dorchester MA 02124", + Decimal("8"), + Decimal("7"), + Decimal("4"), + u"B3", + u"Greater Mattapan", + Decimal("9"), + u"Ward 14", + Decimal("1411"), + u"480 Harvard St", + Decimal("2124"), + Decimal("42.288"), + Decimal("-71.0927"), + u"Citizens Connect App", + ), # noqa + ( + 5, # ds auto increment + Decimal("101002153890"), + datetime.datetime(2017, 7, 6, 23, 29, 13), + datetime.datetime(2017, 9, 11, 8, 30), + None, + u"ONTIME", + u"Open", + u" ", # no strip_extra_white + u"Graffiti Removal", + u"Property Management", + u"Graffiti", + u"Graffiti Removal", + u"PROP_GRAF_GraffitiRemoval", + u"PROP", + u" 
https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # no strip_extra_white + None, + u"522 Saratoga St East Boston MA 02128", + Decimal("1"), + Decimal("9"), + Decimal("1"), + u"A7", + u"East Boston", + Decimal("1"), + u"Ward 1", + Decimal("110"), + u"522 Saratoga St", + Decimal("2128"), + Decimal("42.3807"), + Decimal("-71.0259"), + u"Citizens Connect App", + ), # noqa + ( + 6, # ds auto increment + Decimal("101002153889"), + datetime.datetime(2017, 7, 6, 23, 24, 20), + datetime.datetime(2017, 9, 11, 8, 30), + None, + u"ONTIME", + u"Open", + u" ", # no strip_extra_white + u"Graffiti Removal", + u"Property Management", + u"Graffiti", + u"Graffiti Removal", + u"PROP_GRAF_GraffitiRemoval", + u"PROP", + u" https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", # no strip_extra_white + None, + u"965 Bennington St East Boston MA 02128", + Decimal("1"), + Decimal("9"), + Decimal("1"), + u"A7", + u"East Boston", + Decimal("1"), + u"Ward 1", + Decimal("112"), + u"965 Bennington St", + Decimal("2128"), + Decimal("42.386"), + Decimal("-71.008"), + u"Citizens Connect App", + ), + ] # noqa diff --git a/requirements.txt b/requirements.txt index fe92b6d7..ce7cd03e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ -ckantoolkit +ckantoolkit>=0.0.4 requests[security]>=2.11.1 six>=1.12.0 tabulator==1.53.5 Unidecode==1.0.22 python-dateutil>=2.8.2 -chardet==5.2.0 \ No newline at end of file +chardet==5.2.0
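
Finally, once a resource has been re-loaded, the effect of the flag can be verified from the API. A sketch, assuming CKAN 2.11 (where `update_datastore_info_field` above merges the plugin data into each field) and a placeholder resource id:

```python
import ckan.plugins.toolkit as toolkit

info = toolkit.get_action('datastore_info')(
    {'ignore_auth': True}, {'id': 'my-resource-id'}  # placeholder id
)
for field in info['fields']:
    # strip_extra_white is surfaced per field by update_datastore_info_field()
    print(field['id'], field.get('strip_extra_white'))
```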