From 4acf1f6268cebabbcf5c7b444c98da0a5b348947 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martynas=20Jusevi=C4=8Dius?= Date: Sun, 7 Jun 2026 23:04:20 +0200 Subject: [PATCH 1/2] Extract JSON-LD from HTML on the proxy path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HtmlJsonLDReader replaces the old JsonLDReader: a thin Jsoup-based wrapper that finds every +EOF +) \ +| grep -q "$STATUS_NO_CONTENT" + +# check that the triple from the embedded JSON-LD is queryable + +curl -k -f -s -G \ + -E "$AGENT_CERT_FILE":"$AGENT_CERT_PWD" \ + -H "Accept: application/n-triples" \ +"$END_USER_BASE_URL" \ +| tr -d '\n' \ +| grep '"named object HTML/JSON-LD POST"' > /dev/null diff --git a/http-tests/proxy/GET-proxied-html-jsonld.sh b/http-tests/proxy/GET-proxied-html-jsonld.sh new file mode 100755 index 0000000000..6a3e11a303 --- /dev/null +++ b/http-tests/proxy/GET-proxied-html-jsonld.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +set -euo pipefail + +initialize_dataset "$END_USER_BASE_URL" "$TMP_END_USER_DATASET" "$END_USER_ENDPOINT_URL" +initialize_dataset "$ADMIN_BASE_URL" "$TMP_ADMIN_DATASET" "$ADMIN_ENDPOINT_URL" +purge_cache "$END_USER_VARNISH_SERVICE" +purge_cache "$ADMIN_VARNISH_SERVICE" +purge_cache "$FRONTEND_VARNISH_SERVICE" + +# add agent to the readers group to be able to read documents + +add-agent-to-group.sh \ + -f "$OWNER_CERT_FILE" \ + -p "$OWNER_CERT_PWD" \ + --agent "$AGENT_URI" \ + "${ADMIN_BASE_URL}acl/groups/readers/" + +# Regression: when an upstream proxied URI returns text/html (e.g. a schema.org term page) +# but embeds JSON-LD via + + """.formatted(EX); + + Model model = parse(html); + + assertTrue(model.contains( + ResourceFactory.createResource(EX + "alice"), + ResourceFactory.createProperty(EX, "name"), + "Alice")); + assertTrue(model.contains( + ResourceFactory.createResource(EX + "alice"), + org.apache.jena.vocabulary.RDF.type, + ResourceFactory.createResource(EX + "Person"))); + } + + @Test + public void testMultipleScriptsAreMerged() + { + String html = """ + + + + + """.formatted(EX, EX); + + Model model = parse(html); + + assertTrue(model.contains( + ResourceFactory.createResource(EX + "alice"), + ResourceFactory.createProperty(EX, "name"), + "Alice")); + assertTrue(model.contains( + ResourceFactory.createResource(EX + "bob"), + ResourceFactory.createProperty(EX, "name"), + "Bob")); + } + + @Test + public void testMissingScriptThrows() + { + String html = "no jsonld

nothing

"; + + assertThrows(RiotParseException.class, () -> parse(html)); + } + + @Test + public void testOtherScriptTypesIgnored() + { + // a non-ld+json + + + """.formatted(EX); + + Model model = parse(html); + + assertTrue(model.contains( + ResourceFactory.createResource(EX + "alice"), + ResourceFactory.createProperty(EX, "name"), + "Alice")); + assertFalse(model.contains( + ResourceFactory.createResource(EX + "js"), + ResourceFactory.createProperty(EX, "name"), + "JS")); + } + + @Test + public void testSameOutputAsDirectJsonLdParse() + { + // the HTML reader must be a transparent wrapper around Jena's JSON-LD11 reader: + // wrapping the same payload in HTML must yield exactly the same model as parsing the payload directly + String jsonLd = """ + { + "@context": {"ex": "%s", "name": {"@id": "ex:name"}}, + "@id": "ex:alice", + "@type": "ex:Person", + "name": "Alice" + } + """.formatted(EX); + String html = ""; + + Model direct = ModelFactory.createDefaultModel(); + RDFParser.create(). + source(new ByteArrayInputStream(jsonLd.getBytes(StandardCharsets.UTF_8))). + lang(Lang.JSONLD11). + base(BASE_URI). + parse(StreamRDFLib.graph(direct.getGraph())); + + Model viaHtml = parse(html); + + assertEquals(direct.size(), viaHtml.size()); + assertTrue(direct.isIsomorphicWith(viaHtml)); + } + +} From 876b72ed717003a23bb47bab91c61f8b49cf0545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martynas=20Jusevi=C4=8Dius?= Date: Sun, 7 Jun 2026 23:24:03 +0200 Subject: [PATCH 2/2] Drop debug echos from GET-proxied-html-jsonld Match the terse failure style of the sibling proxy tests; the assertion logic is unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) --- http-tests/proxy/GET-proxied-html-jsonld.sh | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/http-tests/proxy/GET-proxied-html-jsonld.sh b/http-tests/proxy/GET-proxied-html-jsonld.sh index 6a3e11a303..28e4fd9722 100755 --- a/http-tests/proxy/GET-proxied-html-jsonld.sh +++ b/http-tests/proxy/GET-proxied-html-jsonld.sh @@ -34,10 +34,7 @@ content_type=$(curl -k -f -s -G -w "%{content_type}" -o /tmp/proxy-html-jsonld.b case "$content_type" in text/turtle*) ;; - *) - echo "DEBUG: Expected Content-Type: text/turtle*, got: $content_type" >&2 - exit 1 - ;; + *) exit 1 ;; esac # 2. The body must parse as turtle and contain at least one triple with @@ -47,12 +44,6 @@ esac triple_count=$(rapper -q --input turtle --output ntriples /tmp/proxy-html-jsonld.body - \ | grep -c "^<${target_uri}>" || true) -if [ "$triple_count" -lt 1 ]; then - echo "DEBUG: Expected at least one triple with <${target_uri}> as subject, got: $triple_count" >&2 - echo "DEBUG: Body preview:" >&2 - head -c 500 /tmp/proxy-html-jsonld.body >&2 - echo >&2 - exit 1 -fi - rm -f /tmp/proxy-html-jsonld.body + +if [ "$triple_count" -lt 1 ]; then exit 1; fi