Skip to content

Commit

Permalink
Web pages of size greater than 2^20 bytes are not ignored anymore
Browse files Browse the repository at this point in the history
  • Loading branch information
Lucas-C committed Jul 15, 2020
1 parent 068ff0b commit a6ea1c5
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 15 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/)
and this project adheres to [Semantic Versioning](http://semver.org/).

## [1.0.4] - 2020-07-15
### Changed
- web pages of size greater than 2<sup>20</sup> bytes are not ignored anymore

## [1.0.3] - 2020-07-09
### Changed
- silenced `InsecureRequestWarning`s
Expand Down
18 changes: 10 additions & 8 deletions linkbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def send_pingback(source_url, target_url, config=LinkbackConfig(), resp_content=
return False
LOGGER.info("Pingback notification sent for URL %s, endpoint response: %s", target_url, response)
return True
except (ConnectionError, HTTPError, RequestException, ResponseTooBig, SSLError) as error:
except (ConnectionError, HTTPError, RequestException, SSLError) as error:
LOGGER.error("Failed to send Pingback for link url %s: [%s] %s", target_url, error.__class__.__name__, error)
return False
except Exception: # unexpected exception => we display the stacktrace:
Expand All @@ -157,7 +157,8 @@ def send_webmention(source_url, target_url, config=LinkbackConfig(), resp_conten
pass
if not server_uri and resp_headers.get('Content-Type', '').startswith('text/html'):
# As a fallback, we try parsing the HTML, looking for <link> elements
for link in BeautifulSoup(resp_content, BS4_HTML_PARSER).find_all(rel=WEBMENTION_POSS_REL, href=True):
doc_soup = BeautifulSoup(resp_content, BS4_HTML_PARSER) # HTML parsing could be factored out of both methods
for link in doc_soup.find_all(rel=WEBMENTION_POSS_REL, href=True):
if link.get('href'):
server_uri = link.get('href')
if not server_uri:
Expand All @@ -170,7 +171,7 @@ def send_webmention(source_url, target_url, config=LinkbackConfig(), resp_conten
response.raise_for_status()
LOGGER.info("WebMention notification sent for URL %s, endpoint response: %s", target_url, response.text)
return True
except (ConnectionError, HTTPError, RequestException, ResponseTooBig, SSLError) as error:
except (ConnectionError, HTTPError, RequestException, SSLError) as error:
LOGGER.error("Failed to send WebMention for link url %s: [%s] %s", target_url, error.__class__.__name__, error)
return False
except Exception: # unexpected exception => we display the stacktrace:
Expand All @@ -192,12 +193,12 @@ def requests_get_with_max_size(url, config=LinkbackConfig()):
for chunk in response.iter_content(chunk_size=GET_CHUNK_SIZE, decode_unicode=True):
content += chunk if response.encoding else chunk.decode()
if len(content) >= MAX_RESPONSE_LENGTH:
raise ResponseTooBig("The response for URL {} was too large (> {} bytes).".format(url, MAX_RESPONSE_LENGTH))
# Even truncated, the output may still be parsable as HTML to extract <link> tags.
# And if not, the linkback endpoint may be present as an HTTP header, so we do not abort and still return the content.
LOGGER.warning("The response for URL {} was too large, and hence was truncated to {} bytes.".format(url, MAX_RESPONSE_LENGTH))
break
return content, response.headers

class ResponseTooBig(Exception):
    """Raised when a fetched response exceeds the configured size limit."""

class XmlRpcTransport(xmlrpc.client.Transport):
def __init__(self, config):
super().__init__()
Expand All @@ -208,7 +209,8 @@ def __init__(self, config):

def make_connection(self, host):
conn = super().make_connection(host)
conn.timeout = self.config.timeout
if self.config.timeout is not None:
conn.timeout = self.config.timeout
return conn

class SafeXmlRpcTransport(xmlrpc.client.SafeTransport):
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pelican-plugin-linkbacks"
version = "1.0.3"
version = "1.0.4"
description = "Pelican plugin implementing Linkback protocols, on the linking server side"
authors = ["Lucas Cimon <[email protected]>"]
license = "AGPL-3.0"
Expand Down
28 changes: 22 additions & 6 deletions test_linkbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pelican.generators import ArticlesGenerator
from pelican.tests.support import get_settings

from linkbacks import process_all_articles_linkbacks, CACHE_FILENAME
from linkbacks import process_all_articles_linkbacks, CACHE_FILENAME, LOGGER, MAX_RESPONSE_LENGTH


CUR_DIR = os.path.dirname(__file__)
Expand All @@ -13,6 +13,7 @@

def setup():
    """Module-level test setup: make logging maximally verbose for the test run."""
    logging.root.setLevel(logging.DEBUG)
    # NOTE(review): presumably LOGGER is pelican.log.FatalLogger — confirm against linkbacks.py
    LOGGER.disable_filter()  # disabling LimitFilter log deduping from pelican.log.FatalLogger


@httpretty.activate
Expand Down Expand Up @@ -89,7 +90,21 @@ def test_webmention_http_error(tmpdir, caplog):
assert 'Failed to send WebMention for link url http://localhost/sub/some-page.html' in caplog.text
assert '503' in caplog.text

def _setup_http_mocks(pingback=('header', 'link'), webmention=('header', 'link')):
@httpretty.activate
def test_response_too_big_and_link_in_header(tmpdir, caplog):
    # Target page body exceeds MAX_RESPONSE_LENGTH but the pingback endpoint is
    # advertised via an HTTP header: the oversized page must be truncated (with a
    # warning logged), not ignored, and the linkback must still be sent (== 1).
    _setup_http_mocks(pingback=('header',), webmention=(), fat_html=True)
    article_generator = _build_article_generator(TEST_CONTENT_DIR, tmpdir)
    assert process_all_articles_linkbacks([article_generator]) == 1
    assert 'The response for URL http://localhost/sub/some-page.html was too large, and hence was truncated' in caplog.text

@httpretty.activate
def test_response_too_big_and_link_in_html(tmpdir, caplog):
    # Target page body exceeds MAX_RESPONSE_LENGTH and the pingback endpoint is
    # only present as a <link> tag in the HTML: even the truncated body must be
    # parsed to find the endpoint, and the linkback must still be sent (== 1).
    _setup_http_mocks(pingback=('link',), webmention=(), fat_html=True)
    article_generator = _build_article_generator(TEST_CONTENT_DIR, tmpdir)
    assert process_all_articles_linkbacks([article_generator]) == 1
    assert 'The response for URL http://localhost/sub/some-page.html was too large, and hence was truncated' in caplog.text

def _setup_http_mocks(pingback=('header', 'link'), webmention=('header', 'link'), fat_html=False):
headers = {'Content-Type': 'text/html'}
if 'header' in pingback:
headers['X-Pingback'] = 'http://localhost/sub/pingback-endpoint'
Expand All @@ -98,7 +113,7 @@ def _setup_http_mocks(pingback=('header', 'link'), webmention=('header', 'link')
httpretty.register_uri(
httpretty.GET, 'http://localhost/sub/some-page.html',
adding_headers=headers,
body=_build_html_content(pingback, webmention)
body=_build_html_content(pingback, webmention, fat_html)
)
# Pingback endpoint:
xmlrpc_body = _build_xmlrpc_success('Pingback registered. Keep the web talking! :-)')
Expand All @@ -118,7 +133,7 @@ def _setup_http_mocks(pingback=('header', 'link'), webmention=('header', 'link')
status=503 if 'http_error' in webmention else 200,
)

def _build_html_content(pingback, webmention):
def _build_html_content(pingback, webmention, fat_html=False):
return '''<!DOCTYPE html>
<html lang="en-US">
<head>
Expand All @@ -129,9 +144,10 @@ def _build_html_content(pingback, webmention):
{webmention_link}
</head>
<body>
Dummy linked content
Dummy linked content {extra_body}
</body>'''.format(pingback_link='<link rel="pingback" href="http://localhost/sub/pingback-endpoint">' if 'link' in pingback else '',
webmention_link='<link rel="webmention" href="http://localhost/sub/webmention-endpoint">' if 'link' in webmention else '')
webmention_link='<link rel="webmention" href="http://localhost/sub/webmention-endpoint">' if 'link' in webmention else '',
extra_body='X'*MAX_RESPONSE_LENGTH if fat_html else '')

def _build_xmlrpc_success(message):
return '''<?xml version="1.0" encoding="UTF-8"?>
Expand Down

0 comments on commit a6ea1c5

Please sign in to comment.