summary refs log tree commit diff
path: root/gnu/packages/python-web.scm
diff options
context:
space:
mode:
author Maxim Cournoyer <maxim.cournoyer@gmail.com> 2022-05-03 16:05:36 -0400
committer Maxim Cournoyer <maxim.cournoyer@gmail.com> 2022-05-31 14:52:33 -0400
commit 4820a23521727091d0085e5b381aa5c44ebd2ecb (patch)
tree e32659de5d2965cff077d3664aa2a9d143478e2a /gnu/packages/python-web.scm
parent 0999af5b42540f2d5f4b52c65a7e350f071d2f3c (diff)
download guix-4820a23521727091d0085e5b381aa5c44ebd2ecb.tar.gz
gnu: Add python-extruct.
* gnu/packages/python-web.scm (python-extruct): New variable.
Diffstat (limited to 'gnu/packages/python-web.scm')
-rw-r--r-- gnu/packages/python-web.scm 47
1 files changed, 47 insertions, 0 deletions
diff --git a/gnu/packages/python-web.scm b/gnu/packages/python-web.scm
index 59828d7473..427994e22b 100644
--- a/gnu/packages/python-web.scm
+++ b/gnu/packages/python-web.scm
@@ -97,6 +97,7 @@
   #:use-module (gnu packages python-science)
   #:use-module (gnu packages python-xyz)
   #:use-module (gnu packages qt)
+  #:use-module (gnu packages rdf)
   #:use-module (gnu packages rpc)
   #:use-module (gnu packages serialization)
   #:use-module (gnu packages sphinx)
@@ -7441,3 +7442,49 @@ characters in a smarter, more visually pleasing style.")
 implementing the full Microformats2 (mf2) specification, including backward
 compatibility with Microformats1 (mf1).")
     (license license:expat)))
+
+(define-public python-extruct
+  (package
+    (name "python-extruct")
+    (version "0.13.0")
+    (source (origin
+              (method git-fetch)        ;for tests
+              (uri (git-reference
+                    (url "https://github.com/scrapinghub/extruct")
+                    (commit (string-append "v" version)))) ;"v" + version
+              (file-name (git-file-name name version))
+              (sha256
+               (base32
+                "075zldf3dqcc429z1vk2ngbmv034bnlyk6arh3rh30jbsvz9pzl5"))))
+    (build-system python-build-system)
+    (arguments
+     (list
+      #:phases
+      #~(modify-phases %standard-phases
+          (replace 'check               ;run pytest as the check phase
+            (lambda* (#:key tests? #:allow-other-keys)
+              (when tests?              ;do nothing when tests are disabled
+                (invoke "pytest" "-vv" "tests")))))))
+    (native-inputs (list python-pytest)) ;invoked by the check phase above
+    (propagated-inputs
+     (list python-html-text
+           python-jstyleson
+           python-lxml
+           python-mf2py
+           python-pyrdfa3
+           python-rdflib
+           python-rdflib-jsonld
+           python-w3lib))
+    (home-page "https://github.com/scrapinghub/extruct")
+    (synopsis "Extract embedded metadata from HTML markup")
+    (description "@code{extruct} is a Python library for extracting embedded
+metadata from HTML markup.  Currently, extruct supports:
+@itemize
+@item W3C's HTML Microdata
+@item embedded JSON-LD
+@item Microformat via mf2py
+@item Facebook's Open Graph
+@item (experimental) RDFa via rdflib
+@item Dublin Core Metadata (DC-HTML-2003)
+@end itemize")
+    (license license:bsd-3)))