+169
-19
lines changedFilter options
+169
-19
lines changed Original file line number Diff line number Diff line change
@@ -522,32 +522,92 @@ def _coalesce(
522
522
return default
523
523
524
524
525
+
_RFC3986_SUBDELIMS = "!$&'()*+,;="
526
+
"""
527
+
``sub-delims`` production from `RFC 3986, section 2.2
528
+
<https://www.rfc-editor.org/rfc/rfc3986.html#section-2.2>`_.
529
+
"""
530
+
531
+
_RFC3986_PCHAR_NU = "%" + _RFC3986_SUBDELIMS + ":@"
532
+
"""
533
+
The non-unreserved characters in the ``pchar`` production from RFC 3986.
534
+
"""
535
+
536
+
_QUERY_SAFE_CHARS = _RFC3986_PCHAR_NU + "/?"
537
+
"""
538
+
The non-unreserved characters that are safe to use in the query and fragment
539
+
components.
540
+
541
+
.. code-block::
542
+
543
+
pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
543
+
query = *( pchar / "/" / "?" )
fragment = *( pchar / "/" / "?" )
545
+
"""
546
+
547
+
_USERNAME_SAFE_CHARS = _RFC3986_SUBDELIMS + "%"
548
+
"""
549
+
The non-unreserved characters that are safe to use in the username and password
550
+
components.
551
+
552
+
.. code-block::
553
+
554
+
userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
555
+
556
+
":" is excluded as this is only used for the username and password components,
557
+
and they are treated separately.
558
+
"""
559
+
560
+
_PATH_SAFE_CHARS = _RFC3986_PCHAR_NU + "/"
561
+
"""
562
+
The non-unreserved characters that are safe to use in the path component.
563
+
564
+
565
+
This is based on various path-related productions from RFC 3986.
566
+
"""
567
+
568
+
525
569
def _iri2uri(iri: str) -> str:
526
570
"""
527
-
Convert an IRI to a URI (Python 3).
528
-
https://stackoverflow.com/a/42309027
529
-
https://stackoverflow.com/a/40654295
530
-
netloc should be encoded using IDNA;
531
-
non-ascii URL path should be encoded to UTF-8 and then percent-escaped;
532
-
non-ascii query parameters should be encoded to the encoding of a page
533
-
URL was extracted from (or to the encoding server uses), then
534
-
percent-escaped.
571
+
Prior art:
572
+
573
+
* `iri_to_uri from Werkzeug <https://github.com/pallets/werkzeug/blob/92c6380248c7272ee668e1f8bbd80447027ccce2/src/werkzeug/urls.py#L926-L931>`_
574
+
535
575
>>> _iri2uri("https://dbpedia.org/resource/Almería")
536
576
'https://dbpedia.org/resource/Almer%C3%ADa'
537
577
"""
578
+
# https://datatracker.ietf.org/doc/html/rfc3986
538
579
# https://datatracker.ietf.org/doc/html/rfc3305
539
580
540
-
(scheme, netloc, path, query, fragment) = urlsplit(iri)
581
+
parts = urlsplit(iri)
582
+
(scheme, netloc, path, query, fragment) = parts
541
583
542
-
# Just support http/https, otherwise return the iri unmolested
584
+
# Just support http/https, otherwise return the iri unaltered
543
585
if scheme not in ["http", "https"]:
544
586
return iri
545
587
546
-
scheme = quote(scheme)
547
-
netloc = netloc.encode("idna").decode("utf-8")
548
-
path = quote(path)
549
-
query = quote(query)
550
-
fragment = quote(fragment)
588
+
path = quote(path, safe=_PATH_SAFE_CHARS)
589
+
query = quote(query, safe=_QUERY_SAFE_CHARS)
590
+
fragment = quote(fragment, safe=_QUERY_SAFE_CHARS)
591
+
592
+
if parts.hostname:
593
+
netloc = parts.hostname.encode("idna").decode("ascii")
594
+
else:
595
+
netloc = ""
596
+
597
+
if ":" in netloc:
598
+
# Quote IPv6 addresses
599
+
netloc = f"[{netloc}]"
600
+
601
+
if parts.port:
602
+
netloc = f"{netloc}:{parts.port}"
603
+
604
+
if parts.username:
605
+
auth = quote(parts.username, safe=_USERNAME_SAFE_CHARS)
606
+
if parts.password:
607
+
pass_quoted = quote(parts.password, safe=_USERNAME_SAFE_CHARS)
608
+
auth = f"{auth}:{pass_quoted}"
609
+
netloc = f"{auth}@{netloc}"
610
+
551
611
uri = urlunsplit((scheme, netloc, path, query, fragment))
552
612
553
613
if iri.endswith("#") and not uri.endswith("#"):
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
1
+
import logging
1
2
import re
2
3
from http.server import BaseHTTPRequestHandler
3
4
from test.data import TEST_DATA_DIR
4
5
from test.utils import GraphHelper
5
6
from test.utils.graph import cached_graph
6
-
from test.utils.http import ctx_http_handler
7
+
from test.utils.http import (
8
+
MOCK_HTTP_REQUEST_WILDCARD,
9
+
MockHTTPRequest,
10
+
ctx_http_handler,
11
+
)
7
12
from test.utils.httpservermock import (
8
13
MethodName,
9
14
MockHTTPResponse,
10
15
ServedBaseHTTPServerMock,
11
16
)
17
+
from test.utils.wildcard import URL_PARSE_RESULT_WILDCARD
12
18
from urllib.error import HTTPError
13
19
14
20
import pytest
@@ -235,7 +241,34 @@ def test_5xx(self):
235
241
assert raised.value.code == 500
236
242
237
243
238
-
def test_iri_source(function_httpmock: ServedBaseHTTPServerMock) -> None:
244
+
@pytest.mark.parametrize(
245
+
["url_suffix", "expected_request"],
246
+
[
247
+
(
248
+
"/resource/Almería",
249
+
MOCK_HTTP_REQUEST_WILDCARD._replace(
250
+
path="/resource/Almer%C3%ADa",
251
+
parsed_path=URL_PARSE_RESULT_WILDCARD._replace(
252
+
path="/resource/Almer%C3%ADa"
253
+
),
254
+
),
255
+
),
256
+
(
257
+
"/resource/Almería?foo=bar",
258
+
MOCK_HTTP_REQUEST_WILDCARD._replace(
259
+
parsed_path=URL_PARSE_RESULT_WILDCARD._replace(
260
+
path="/resource/Almer%C3%ADa"
261
+
),
262
+
path_query={"foo": ["bar"]},
263
+
),
264
+
),
265
+
],
266
+
)
267
+
def test_iri_source(
268
+
url_suffix: str,
269
+
expected_request: MockHTTPRequest,
270
+
function_httpmock: ServedBaseHTTPServerMock,
271
+
) -> None:
239
272
diverse_triples_path = TEST_DATA_DIR / "variants/diverse_triples.ttl"
240
273
241
274
function_httpmock.responses[MethodName.GET].append(
@@ -247,9 +280,11 @@ def test_iri_source(function_httpmock: ServedBaseHTTPServerMock) -> None:
247
280
)
248
281
)
249
282
g = Graph()
250
-
g.parse(f"{function_httpmock.url}/resource/Almería")
283
+
g.parse(f"{function_httpmock.url}{url_suffix}")
251
284
assert function_httpmock.call_count == 1
252
285
GraphHelper.assert_triple_sets_equals(cached_graph((diverse_triples_path,)), g)
286
+
assert len(g) > 1
253
287
254
288
req = function_httpmock.requests[MethodName.GET].pop(0)
255
-
assert req.path == "/resource/Almer%C3%ADa"
289
+
logging.debug("req = %s", req)
290
+
assert expected_request == req
Original file line number Diff line number Diff line change
@@ -635,6 +635,24 @@ def test_get_tree(
635
635
"http://example.com:1231/",
636
636
},
637
637
),
638
+
(
639
+
"http://example.com:1231/a=b",
640
+
{
641
+
"http://example.com:1231/a=b",
642
+
},
643
+
),
644
+
(
645
+
"http://aé:aé@example.com:1231/bé/a=bé&c=d#a=bé&c=d",
646
+
{
647
+
"http://a%C3%A9:a%C3%A9@example.com:1231/b%C3%A9/a=b%C3%A9&c=d#a=b%C3%A9&c=d",
648
+
},
649
+
),
650
+
(
651
+
"http://a%C3%A9:a%C3%A9@example.com:1231/b%C3%A9/a=b%C3%A9&c=d#a=b%C3%A9&c=d",
652
+
{
653
+
"http://a%C3%A9:a%C3%A9@example.com:1231/b%C3%A9/a=b%C3%A9&c=d#a=b%C3%A9&c=d",
654
+
},
655
+
),
638
656
],
639
657
)
640
658
def test_iri2uri(iri: str, expected_result: Union[Set[str], Type[Exception]]) -> None:
Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@
4
4
import random
5
5
from contextlib import contextmanager
6
6
from http.server import BaseHTTPRequestHandler, HTTPServer
7
+
from test.utils.wildcard import EQ_WILDCARD
7
8
from threading import Thread
8
9
from typing import (
9
10
Dict,
@@ -62,6 +63,14 @@ class MockHTTPRequest(NamedTuple):
62
63
body: Optional[bytes]
63
64
64
65
66
+
MOCK_HTTP_REQUEST_WILDCARD = MockHTTPRequest(
67
+
EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD
68
+
)
69
+
"""
70
+
This object should be equal to any `MockHTTPRequest` object.
71
+
"""
72
+
73
+
65
74
class MockHTTPResponse(NamedTuple):
66
75
status_code: int
67
76
reason_phrase: str
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
1
+
from typing import Any
2
+
from urllib.parse import ParseResult
3
+
4
+
5
+
class EqWildcard:
    """
    An object that compares equal to anything.

    Intended for use in test expectations: place an instance in any position
    of an expected value (for example a field of a named tuple) where the
    actual value should not be checked.

    Only ``__eq__`` and ``__ne__`` are needed. Python has no reflected
    equality hook (there is no ``__req__``); when the left operand's
    ``__eq__`` returns ``NotImplemented``, the interpreter calls the right
    operand's ``__eq__`` instead, so ``other == EqWildcard()`` also matches.

    Because ``__eq__`` is overridden without ``__hash__``, instances are
    unhashable.
    """

    def __eq__(self, other: Any) -> bool:
        """Return ``True`` regardless of what ``other`` is."""
        return True

    def __ne__(self, other: Any) -> bool:
        """Return ``False`` regardless of what ``other`` is."""
        return False

    def __repr__(self) -> str:
        """Return a fixed representation, useful in assertion failure output."""
        return "EqWildcard()"
18
+
19
+
20
+
EQ_WILDCARD: Any = EqWildcard()
21
+
22
+
23
+
URL_PARSE_RESULT_WILDCARD = ParseResult(
24
+
EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD
25
+
)
26
+
"""
27
+
This should be equal to any `ParseResult` object.
28
+
"""
You can’t perform that action at this time.
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4