Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Strip White Space from Cell Values #215

Open
wants to merge 19 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 53 additions & 15 deletions ckanext/xloader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,28 +170,20 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('Ensuring character coding is UTF8')
f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False)
try:
save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter}
try:
with UnknownEncodingStream(csv_filepath, file_format, decoding_result,
skip_rows=skip_rows) as stream:
stream.save(**save_args)
except (EncodingError, UnicodeDecodeError):
with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING,
skip_rows=skip_rows) as stream:
stream.save(**save_args)
csv_filepath = f_write.name

# datastore db connection
engine = get_write_engine()

# get column info from existing table
existing = datastore_resource_exists(resource_id)
existing_info = {}
if existing:
existing_fields = existing.get('fields', [])
ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id})
existing_fields = ds_info.get('fields', [])
existing_info = dict((f['id'], f['info'])
for f in existing_fields
if 'info' in f)
existing_fields_by_headers = dict((f['id'], f)
for f in existing_fields)

# Column types are either set (overridden) in the Data Dictionary page
# or default to text type (which is robust)
Expand All @@ -206,6 +198,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
for f in fields:
if f['id'] in existing_info:
f['info'] = existing_info[f['id']]
f['strip_extra_white'] = existing_fields_by_headers[f['id']].get('strip_extra_white', True)

'''
Delete or truncate existing datastore table before proceeding,
Expand All @@ -222,11 +215,41 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
else:
fields = [
{'id': header_name,
'type': 'text'}
'type': 'text',
'strip_extra_white': True,}
for header_name in headers]

logger.info('Fields: %s', fields)

save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter}
try:
with UnknownEncodingStream(csv_filepath, file_format, decoding_result,
skip_rows=skip_rows) as stream:
super_iter = stream.iter
def strip_white_space_iter():
    """Yield rows from the wrapped stream iterator with string cells stripped.

    Leading and trailing white space is removed from every ``str`` cell
    whose column has ``strip_extra_white`` enabled (a missing key counts
    as enabled). Rows are modified in place and then yielded. Cells that
    have no matching entry in ``fields`` are passed through untouched.
    """
    # The per-column flags do not change between rows, so resolve them once.
    strip_flags = [f.get('strip_extra_white', True) for f in fields or []]
    for row in super_iter():
        # zip() stops at the shorter sequence, so a ragged row with more
        # cells than fields no longer raises IndexError here.
        for _index, (_cell, _strip) in enumerate(zip(row, strip_flags)):
            if _strip and isinstance(_cell, str):
                row[_index] = _cell.strip()
        yield row
stream.iter = strip_white_space_iter
stream.save(**save_args)
except (EncodingError, UnicodeDecodeError):
with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING,
skip_rows=skip_rows) as stream:
super_iter = stream.iter
def strip_white_space_iter():
    """Yield rows from the wrapped stream iterator with string cells stripped.

    Leading and trailing white space is removed from every ``str`` cell
    whose column has ``strip_extra_white`` enabled (a missing key counts
    as enabled). Rows are modified in place and then yielded. Cells that
    have no matching entry in ``fields`` are passed through untouched.
    """
    # The per-column flags do not change between rows, so resolve them once.
    strip_flags = [f.get('strip_extra_white', True) for f in fields or []]
    for row in super_iter():
        # zip() stops at the shorter sequence, so a ragged row with more
        # cells than fields no longer raises IndexError here.
        for _index, (_cell, _strip) in enumerate(zip(row, strip_flags)):
            if _strip and isinstance(_cell, str):
                row[_index] = _cell.strip()
        yield row
stream.iter = strip_white_space_iter
stream.save(**save_args)
csv_filepath = f_write.name

# Create table
from ckan import model
context = {'model': model, 'ignore_auth': True}
Expand Down Expand Up @@ -370,10 +393,13 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
existing = datastore_resource_exists(resource_id)
existing_info = None
if existing:
existing_fields = existing.get('fields', [])
ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id})
existing_fields = ds_info.get('fields', [])
existing_info = dict(
(f['id'], f['info'])
for f in existing_fields if 'info' in f)
existing_fields_by_headers = dict((f['id'], f)
for f in existing_fields)

# Some headers might have been converted from strings to floats and such.
headers = encode_headers(headers)
Expand All @@ -385,6 +411,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):

TYPES, TYPE_MAPPING = get_types()
types = type_guess(stream.sample[1:], types=TYPES, strict=True)
fields = []

# override with types user requested
if existing_info:
Expand All @@ -395,9 +422,15 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
'timestamp': datetime.datetime,
}.get(existing_info.get(h, {}).get('type_override'), t)
for t, h in zip(types, headers)]
for h in headers:
fields.append(existing_fields_by_headers.get(h, {}))
else:
# default strip_extra_white
JVickery-TBS marked this conversation as resolved.
Show resolved Hide resolved
for h in headers:
fields.append({'strip_extra_white': True})

headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()]
type_converter = TypeConverter(types=types)
type_converter = TypeConverter(types=types, fields=fields)

with UnknownEncodingStream(table_filepath, file_format, decoding_result,
skip_rows=skip_rows,
Expand All @@ -418,10 +451,15 @@ def row_iterator():
for h in headers_dicts:
if h['id'] in existing_info:
h['info'] = existing_info[h['id']]
h['strip_extra_white'] = existing_fields_by_headers[h['id']].get('strip_extra_white', True)
# create columns with types user requested
type_override = existing_info[h['id']].get('type_override')
if type_override in list(_TYPE_MAPPING.values()):
h['type'] = type_override
else:
# default strip_extra_white
for h in headers_dicts:
h['strip_extra_white'] = True

logger.info('Determined headers and types: %s', headers_dicts)

Expand Down
8 changes: 7 additions & 1 deletion ckanext/xloader/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ class TypeConverter:
as desired.
"""

def __init__(self, types=None):
def __init__(self, types=None, fields=None):
self.types = types
self.fields = fields

def convert_types(self, extended_rows):
""" Try converting cells to numbers or timestamps if applicable.
Expand All @@ -31,6 +32,11 @@ def convert_types(self, extended_rows):
for cell_index, cell_value in enumerate(row):
if cell_value is None:
row[cell_index] = ''
if self.fields:
# only strip white space if strip_extra_white is True
if self.fields[cell_index].get('strip_extra_white', True) and isinstance(cell_value, str):
cell_value = cell_value.strip()
row[cell_index] = cell_value.strip()
if not cell_value:
continue
cell_type = self.types[cell_index] if self.types else None
Expand Down
16 changes: 16 additions & 0 deletions ckanext/xloader/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from ckan import plugins
from ckan.plugins import toolkit
from ckanext.datastore.interfaces import IDataDictionaryForm

from ckan.model.domain_object import DomainObjectOperation
from ckan.model.resource import Resource
Expand Down Expand Up @@ -34,6 +35,7 @@ class xloaderPlugin(plugins.SingletonPlugin):
plugins.implements(plugins.IResourceController, inherit=True)
plugins.implements(plugins.IClick)
plugins.implements(plugins.IBlueprint)
plugins.implements(IDataDictionaryForm, inherit=True)

# IClick
def get_commands(self):
Expand Down Expand Up @@ -207,6 +209,20 @@ def get_helpers(self):
"is_resource_supported_by_xloader": xloader_helpers.is_resource_supported_by_xloader,
}

# IDataDictionaryForm

def update_datastore_create_schema(self, schema):
    """Add the ``strip_extra_white`` option to the per-field schema.

    The option is given a validator chain built from CKAN's ``default``
    (with ``True``), ``boolean_validator`` and ``to_datastore_plugin_data``
    (with the ``'xloader'`` key) validators. The same ``schema`` object is
    mutated and returned.
    """
    lookup = toolkit.get_validator
    fill_default = lookup('default')
    as_boolean = lookup('boolean_validator')
    store_plugin_data = lookup('to_datastore_plugin_data')
    validators = [fill_default(True), as_boolean, store_plugin_data('xloader')]
    schema['fields']['strip_extra_white'] = validators
    return schema

def update_datastore_info_field(self, field, plugin_data):
    """Merge this plugin's stored settings for a column into ``field``.

    Everything found under the ``'xloader'`` key of ``plugin_data`` is
    copied onto ``field`` (nothing is copied when the key is absent).
    ``field`` is updated in place and returned.
    """
    # all of our plugin data is non-secret, so expose the whole mapping
    xloader_settings = plugin_data.get('xloader', {})
    field.update(xloader_settings)
    return field


def _should_remove_unsupported_resource_from_datastore(res_dict):
if not toolkit.asbool(toolkit.config.get('ckanext.xloader.clean_datastore_tables', False)):
Expand Down
11 changes: 11 additions & 0 deletions ckanext/xloader/templates/datastore/snippets/dictionary_form.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{% ckan_extends %}
{% import 'macros/form.html' as form %}

{% block additional_fields %}
  {{ super() }}
  {# Per-column choice: strip leading/trailing white space from cell values.
     The option labels go through _() so they are translated like the label. #}
  {{ form.select('fields__' ~ position ~ '__strip_extra_white',
    label=_('Strip Extra Leading and Trailing White Space'), options=[
      {'text': _('Yes'), 'value': true},
      {'text': _('No'), 'value': false},
    ], selected=field.get('strip_extra_white')) }}
{% endblock %}
20 changes: 10 additions & 10 deletions ckanext/xloader/tests/test_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def test_boston_311(self, Session):
None,
u"ONTIME",
u"Open",
u" ",
None, # " " transforms to None
JVickery-TBS marked this conversation as resolved.
Show resolved Hide resolved
u"Street Light Outages",
u"Public Works Department",
u"Street Lights",
Expand Down Expand Up @@ -259,14 +259,14 @@ def test_boston_311(self, Session):
None,
u"ONTIME",
u"Open",
u" ",
None, # " " transforms to None
u"Graffiti Removal",
u"Property Management",
u"Graffiti",
u"Graffiti Removal",
u"PROP_GRAF_GraffitiRemoval",
u"PROP",
u" https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg",
u"https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # strip white spaces
None,
u"522 Saratoga St East Boston MA 02128",
u"1",
Expand All @@ -291,14 +291,14 @@ def test_boston_311(self, Session):
None,
u"ONTIME",
u"Open",
u" ",
None, # " " transforms to None
u"Graffiti Removal",
u"Property Management",
u"Graffiti",
u"Graffiti Removal",
u"PROP_GRAF_GraffitiRemoval",
u"PROP",
u" https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg",
u"https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", # strip white spaces
JVickery-TBS marked this conversation as resolved.
Show resolved Hide resolved
None,
u"965 Bennington St East Boston MA 02128",
u"1",
Expand Down Expand Up @@ -1049,7 +1049,7 @@ def test_boston_311(self, Session):
u"",
u"ONTIME",
u"Open",
u" ",
u"", # " " transforms to ""
JVickery-TBS marked this conversation as resolved.
Show resolved Hide resolved
JVickery-TBS marked this conversation as resolved.
Show resolved Hide resolved
u"Street Light Outages",
u"Public Works Department",
u"Street Lights",
Expand Down Expand Up @@ -1081,14 +1081,14 @@ def test_boston_311(self, Session):
u"",
u"ONTIME",
u"Open",
u" ",
u"", # " " transforms to ""
u"Graffiti Removal",
u"Property Management",
u"Graffiti",
u"Graffiti Removal",
u"PROP_GRAF_GraffitiRemoval",
u"PROP",
u" https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg",
u"https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # strip white spaces
u"",
u"522 Saratoga St East Boston MA 02128",
Decimal("1"),
Expand All @@ -1113,14 +1113,14 @@ def test_boston_311(self, Session):
u"",
u"ONTIME",
u"Open",
u" ",
u"", # " " transforms to ""
u"Graffiti Removal",
u"Property Management",
u"Graffiti",
u"Graffiti Removal",
u"PROP_GRAF_GraffitiRemoval",
u"PROP",
u" https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg",
u"https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", # strip white spaces
u"",
u"965 Bennington St East Boston MA 02128",
Decimal("1"),
Expand Down
Loading