Strip White Space from Cell Values #215

Open · wants to merge 19 commits into master
75 changes: 60 additions & 15 deletions ckanext/xloader/loader.py
@@ -171,28 +171,23 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('Ensuring character coding is UTF8')
f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False)
try:
save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter}
try:
with UnknownEncodingStream(csv_filepath, file_format, decoding_result,
skip_rows=skip_rows) as stream:
stream.save(**save_args)
except (EncodingError, UnicodeDecodeError):
with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING,
skip_rows=skip_rows) as stream:
stream.save(**save_args)
csv_filepath = f_write.name

# datastore db connection
engine = get_write_engine()

# get column info from existing table
existing = datastore_resource_exists(resource_id)
existing_info = {}
if existing:
existing_fields = existing.get('fields', [])
if p.toolkit.check_ckan_version(min_version='2.10'):
ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id})
existing_fields = ds_info.get('fields', [])
else:
existing_fields = existing.get('fields', [])
existing_info = dict((f['id'], f['info'])
for f in existing_fields
if 'info' in f)
existing_fields_by_headers = dict((f['id'], f)
for f in existing_fields)

# Column types are either set (overridden) in the Data Dictionary page
# or default to text type (which is robust)
@@ -207,6 +202,8 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
for f in fields:
if f['id'] in existing_info:
f['info'] = existing_info[f['id']]
f['strip_extra_white'] = existing_info[f['id']].get('strip_extra_white') if 'strip_extra_white' in existing_info[f['id']] \
else existing_fields_by_headers[f['id']].get('strip_extra_white', True)

'''
Delete or truncate existing datastore table before proceeding,
@@ -223,11 +220,43 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
else:
fields = [
{'id': header_name,
'type': 'text'}
'type': 'text',
'strip_extra_white': True,}
for header_name in headers]

logger.info('Fields: %s', fields)

save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter}
try:
with UnknownEncodingStream(csv_filepath, file_format, decoding_result,
skip_rows=skip_rows) as stream:
super_iter = stream.iter
def strip_white_space_iter():
for row in super_iter():
if len(row) == len(fields):
for _index, _cell in enumerate(row):
# only strip white space if strip_extra_white is True
if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str):
row[_index] = _cell.strip()
yield row
stream.iter = strip_white_space_iter
stream.save(**save_args)
except (EncodingError, UnicodeDecodeError):
with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING,
skip_rows=skip_rows) as stream:
super_iter = stream.iter
def strip_white_space_iter():
for row in super_iter():
if len(row) == len(fields):
for _index, _cell in enumerate(row):
# only strip white space if strip_extra_white is True
if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str):
row[_index] = _cell.strip()
yield row
stream.iter = strip_white_space_iter
stream.save(**save_args)
csv_filepath = f_write.name

# Create table
from ckan import model
context = {'model': model, 'ignore_auth': True}
@@ -371,10 +400,13 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
existing = datastore_resource_exists(resource_id)
existing_info = None
if existing:
existing_fields = existing.get('fields', [])
ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id})
existing_fields = ds_info.get('fields', [])
existing_info = dict(
(f['id'], f['info'])
for f in existing_fields if 'info' in f)
existing_fields_by_headers = dict((f['id'], f)
for f in existing_fields)

# Some headers might have been converted from strings to floats and such.
headers = encode_headers(headers)
@@ -388,6 +420,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
strict_guessing = p.toolkit.asbool(
config.get('ckanext.xloader.strict_type_guessing', True))
types = type_guess(stream.sample[1:], types=TYPES, strict=strict_guessing)
fields = []

# override with types user requested
if existing_info:
@@ -398,9 +431,15 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
'timestamp': datetime.datetime,
}.get(existing_info.get(h, {}).get('type_override'), t)
for t, h in zip(types, headers)]
for h in headers:
fields.append(existing_fields_by_headers.get(h, {}))
else:
# default strip_extra_white
for h in headers:
fields.append({'strip_extra_white': True})

headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()]
type_converter = TypeConverter(types=types)
type_converter = TypeConverter(types=types, fields=fields)

with UnknownEncodingStream(table_filepath, file_format, decoding_result,
skip_rows=skip_rows,
@@ -421,10 +460,16 @@ def row_iterator():
for h in headers_dicts:
if h['id'] in existing_info:
h['info'] = existing_info[h['id']]
h['strip_extra_white'] = existing_info[h['id']].get('strip_extra_white') if 'strip_extra_white' in existing_info[h['id']] \
else existing_fields_by_headers[h['id']].get('strip_extra_white', True)
# create columns with types user requested
type_override = existing_info[h['id']].get('type_override')
if type_override in list(_TYPE_MAPPING.values()):
h['type'] = type_override
else:
# default strip_extra_white
for h in headers_dicts:
h['strip_extra_white'] = True

logger.info('Determined headers and types: %s', headers_dicts)

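To spell out the dense inline conditional in load_csv()/load_table() above (the `f['strip_extra_white'] = ...` assignment), here is a small hypothetical helper, not part of the PR, that applies the same precedence rule:

```python
# Hypothetical helper, not part of the PR: the precedence applied to the
# 'strip_extra_white' flag in load_csv()/load_table() above.
def resolve_strip_extra_white(column_info, column_field):
    """An explicit Data Dictionary setting stored in the column's 'info'
    dict wins; otherwise fall back to the field-level flag; otherwise
    default to True (strip whitespace)."""
    if 'strip_extra_white' in column_info:
        return column_info['strip_extra_white']
    return column_field.get('strip_extra_white', True)

# resolve_strip_extra_white({'strip_extra_white': False}, {})  -> False
# resolve_strip_extra_white({}, {'strip_extra_white': False})  -> False
# resolve_strip_extra_white({}, {})                            -> True
```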
12 changes: 11 additions & 1 deletion ckanext/xloader/parser.py
@@ -18,8 +18,9 @@ class TypeConverter:
as desired.
"""

def __init__(self, types=None):
def __init__(self, types=None, fields=None):
self.types = types
self.fields = fields

def convert_types(self, extended_rows):
""" Try converting cells to numbers or timestamps if applicable.
@@ -31,7 +32,16 @@ def convert_types(self, extended_rows):
for cell_index, cell_value in enumerate(row):
if cell_value is None:
row[cell_index] = ''
if self.fields:
# only strip white space if strip_extra_white is True
if self.fields[cell_index].get('strip_extra_white', True) and isinstance(cell_value, six.text_type):
cell_value = cell_value.strip()
row[cell_index] = cell_value.strip()
if not cell_value:
# load_csv parity: empty values of string type should be None
if self.types and self.types[cell_index] == six.text_type:
cell_value = None
row[cell_index] = None
continue
cell_type = self.types[cell_index] if self.types else None
if cell_type in [Decimal, None]:
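For illustration, a minimal sketch (not part of the PR) of how the new fields-aware TypeConverter treats an all-whitespace text cell when stripping is enabled. It assumes the converter consumes and yields tabulator-style (row_number, headers, row) tuples, as it does when wired in as a post_parse processor:

```python
# Illustrative only, not part of the PR: an all-whitespace text cell is
# stripped to '' and then set to None, matching load_csv behaviour.
from ckanext.xloader.parser import TypeConverter

converter = TypeConverter(types=[str], fields=[{'strip_extra_white': True}])
rows = [(2, ['notes'], ['   '])]  # a cell containing only spaces
out = list(converter.convert_types(rows))
# The cell is stripped to '' and then, for parity with load_csv, an empty
# text value becomes None.
assert out[0][2] == [None]
```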
25 changes: 25 additions & 0 deletions ckanext/xloader/plugin.py
@@ -20,6 +20,12 @@
def config_declarations(cls):
return cls

if toolkit.check_ckan_version(min_version='2.11'):
from ckanext.datastore.interfaces import IDataDictionaryForm
has_idata_dictionary_form = True
else:
has_idata_dictionary_form = False

log = logging.getLogger(__name__)


@@ -34,6 +40,8 @@ class xloaderPlugin(plugins.SingletonPlugin):
plugins.implements(plugins.IResourceController, inherit=True)
plugins.implements(plugins.IClick)
plugins.implements(plugins.IBlueprint)
if has_idata_dictionary_form:
plugins.implements(IDataDictionaryForm, inherit=True)

# IClick
def get_commands(self):
@@ -208,6 +216,23 @@ def get_helpers(self):
"is_resource_supported_by_xloader": xloader_helpers.is_resource_supported_by_xloader,
}

# IDataDictionaryForm

def update_datastore_create_schema(self, schema):
default = toolkit.get_validator('default')
boolean_validator = toolkit.get_validator('boolean_validator')
to_datastore_plugin_data = toolkit.get_validator('to_datastore_plugin_data')
schema['fields']['strip_extra_white'] = [default(True), boolean_validator, to_datastore_plugin_data('xloader')]
return schema

def update_datastore_info_field(self, field, plugin_data):
# expose all our non-secret plugin data in the field
field.update(plugin_data.get('xloader', {}))
# CKAN version parity
if '_info' in plugin_data:
field.update({'info': plugin_data['_info']})
return field


def _should_remove_unsupported_resource_from_datastore(res_dict):
if not toolkit.asbool(toolkit.config.get('ckanext.xloader.clean_datastore_tables', False)):
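A rough sketch of how the new Data Dictionary flag could be set and read back through the standard datastore actions once this plugin's IDataDictionaryForm implementation is active (CKAN >= 2.11). The helper name and call pattern are illustrative, not part of the PR:

```python
# Illustrative only, not part of the PR: round-tripping the xloader
# 'strip_extra_white' flag via the datastore actions.
from ckan.plugins import toolkit

def disable_stripping(resource_id):
    """Hypothetical helper: turn whitespace stripping off for every column."""
    info = toolkit.get_action('datastore_info')(
        {'ignore_auth': True}, {'id': resource_id})
    fields = [
        {'id': f['id'], 'type': f['type'], 'strip_extra_white': False}
        for f in info.get('fields', [])
    ]
    # update_datastore_create_schema() validates the flag and stores it as
    # 'xloader' plugin data; update_datastore_info_field() exposes it again
    # on each field returned by datastore_info.
    toolkit.get_action('datastore_create')(
        {'ignore_auth': True},
        {'resource_id': resource_id, 'force': True, 'fields': fields})
```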
17 changes: 17 additions & 0 deletions ckanext/xloader/templates/datastore/snippets/dictionary_form.html
@@ -0,0 +1,17 @@
{% ckan_extends %}
{% import 'macros/form.html' as form %}

{% block additional_fields %}
{{ super() }}
{% if h.check_ckan_version(min_version='2.11') %}
{% set field_prefix = 'fields__' %}
{% else %}
{% set field_prefix = 'info__' %}
{% endif %}
{% set is_selected = field.get('info', {}).get('strip_extra_white', field.get('strip_extra_white')) != 'False' %}
{{ form.select(field_prefix ~ position ~ '__strip_extra_white',
label=_('Strip Extra Leading and Trailing White Space'), options=[
{'text': _('Yes'), 'value': true},
{'text': _('No'), 'value': false},
], selected=is_selected) }}
{% endblock %}
8 changes: 4 additions & 4 deletions ckanext/xloader/tests/samples/boston_311_sample.csv
@@ -1,4 +1,4 @@
CASE_ENQUIRY_ID,open_dt,target_dt,closed_dt,OnTime_Status,CASE_STATUS,CLOSURE_REASON,CASE_TITLE,SUBJECT,REASON,TYPE,QUEUE,Department,SubmittedPhoto,ClosedPhoto,Location,Fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,LOCATION_STREET_NAME,LOCATION_ZIPCODE,Latitude,Longitude,Source
101002153891,2017-07-06 23:38:43,2017-07-21 08:30:00,,ONTIME,Open, ,Street Light Outages,Public Works Department,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,,,480 Harvard St Dorchester MA 02124,8,07,4,B3,Greater Mattapan,9,Ward 14,1411,480 Harvard St,02124,42.288,-71.0927,Citizens Connect App
101002153890,2017-07-06 23:29:13,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg,,522 Saratoga St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0110,522 Saratoga St,02128,42.3807,-71.0259,Citizens Connect App
101002153889,2017-07-06 23:24:20,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg,,965 Bennington St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0112,965 Bennington St,02128,42.386,-71.008,Citizens Connect App
CASE_ENQUIRY_ID,open_dt,target_dt,closed_dt,OnTime_Status,CASE_STATUS,CLOSURE_REASON,CASE_TITLE,SUBJECT,REASON,TYPE,QUEUE,Department,SubmittedPhoto,ClosedPhoto,Location,Fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,LOCATION_STREET_NAME,LOCATION_ZIPCODE,Latitude,Longitude,Source
101002153891,2017-07-06 23:38:43,2017-07-21 08:30:00,,ONTIME,Open, ,Street Light Outages,Public Works Department ,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,,,480 Harvard St Dorchester MA 02124,8,07,4,B3,Greater Mattapan,9,Ward 14,1411,480 Harvard St,02124,42.288,-71.0927,Citizens Connect App
101002153890,2017-07-06 23:29:13,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg,,522 Saratoga St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0110,522 Saratoga St,02128,42.3807,-71.0259,Citizens Connect App
101002153889,2017-07-06 23:24:20,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg,,965 Bennington St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0112,965 Bennington St,02128,42.386,-71.008,Citizens Connect App
2 changes: 1 addition & 1 deletion ckanext/xloader/tests/test_jobs.py
@@ -81,7 +81,7 @@ def test_xloader_data_into_datastore(self, cli, data):
with mock.patch("ckanext.xloader.jobs.get_response", get_response):
stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output
assert "File hash: d44fa65eda3675e11710682fdb5f1648" in stdout
assert "Fields: [{'id': 'x', 'type': 'text'}, {'id': 'y', 'type': 'text'}]" in stdout
assert "Fields: [{'id': 'x', 'type': 'text', 'strip_extra_white': True}, {'id': 'y', 'type': 'text', 'strip_extra_white': True}]" in stdout
Collaborator:
We also need tests of the ability to disable this behaviour.

Contributor Author:
@ThrawnCA okay, added test coverage for load_csv and load_table with 'strip_extra_white': False.

Contributor Author:
...welp, those didn't work. They worked locally on 2.9 and 2.11; I'll see if I can replicate and fix them up.

Contributor Author:
@ThrawnCA I can't replicate the failed tests in any of my environments.

Collaborator:
@JVickery-TBS can you rebase this and try again to see if the 'tests' fix themselves.

assert "Copying to database..." in stdout
assert "Creating search index..." in stdout
assert "Express Load completed" in stdout