Coverage for /home/runner/work/viur-core/viur-core/viur/src/viur/core/bones/text.py: 50%
213 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-13 11:04 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-13 11:04 +0000
1"""
2The `text` module contains the `TextBone` and a custom HTML-Parser
3to validate and extract client data for the `TextBone`.
4"""
5import html
6import string
7import typing as t
8import warnings
9from html.parser import HTMLParser
10from viur.core import db, conf, i18n
11from .base import ReadFromClientError, ReadFromClientErrorSeverity
12from .raw import RawBone
class HtmlBoneConfiguration(t.TypedDict):
    """Typed mapping that configures how HTML content is handled in TextBone instances."""

    #: HTML tags that are allowed in TextBone instances.
    validTags: list[str]

    #: Allowed attribute names per tag. A tag that is not listed accepts no attributes.
    validAttrs: dict[str, list[str]]

    #: CSS directives that are allowed in TextBone instances.
    validStyles: list[str]

    #: CSS class names that are allowed in TextBone instances.
    validClasses: list[str]

    #: Self-closing HTML tags, i.e. tags without a corresponding end tag.
    singleTags: list[str]
class CollectBlobKeys(HTMLParser):
    """
    A custom HTML parser that collects the blob keys of files referenced by <a> and <img> tags.

    Download URLs are looked up in the "src" attribute (embedded images) as well as in the
    "href" attribute (links to files). All keys found are accumulated in ``self.blobs``.
    """

    # Attributes of <a>/<img> tags that may carry a download URL
    _URL_ATTRS = ("src", "href")

    def __init__(self):
        super().__init__()
        self.blobs = set()

    def handle_starttag(self, tag, attrs):
        """
        Handles the start tag in the HTML content being parsed. If the tag is an <a> or <img> element,
        every "src"/"href" value that the file module recognizes as a download URL contributes its
        blob key to the "blobs" set.

        :param str tag: The current start tag encountered by the parser.
        :param List[Tuple[str, str]] attrs: A list of tuples containing the attribute name and value of the current tag.
        """
        if tag not in ("a", "img"):
            return

        # HTMLParser reports attributes without a value as None; those cannot be URLs
        urls = [v for k, v in attrs if k in self._URL_ATTRS and v]
        if not urls:
            return

        # The file module is optional; without it no download URL can be resolved
        file = getattr(conf.main_app.vi, "file", None)
        if not file:
            return

        for url in urls:
            if filepath := file.parse_download_url(url):
                self.blobs.add(filepath.dlkey)
class HtmlSerializer(HTMLParser):
    """
    A custom HTML parser that extends the HTMLParser class to sanitize and serialize HTML content
    by removing invalid tags and attributes while retaining the valid ones.

    Start tags are not written immediately: they are kept in ``tagCache`` until some content shows up,
    so that elements without any content are dropped from the result.

    :param dict validHtml: A dictionary containing valid HTML tags, attributes, styles, and classes.
    :param dict srcSet: A dictionary containing width and height for srcset attribute processing.
    :param bool convert_charrefs: Passed through to :class:`html.parser.HTMLParser`.
    """

    # Characters that are escaped (or removed) when text content is copied into the result
    __html_serializer_trans = str.maketrans({
        "<": "&lt;",
        ">": "&gt;",
        "\"": "&quot;",
        "'": "&#39;",
        "\n": "",
        "\0": "",
    })

    # Characters that must not appear in attribute names/values or style declarations
    _FILTER_CHARS = "\"'\\\0\r\n@()"
    # Subset of _FILTER_CHARS that stays forbidden even for the lenient attributes below
    _STRICT_FILTER_CHARS = "\"'\\\0\r\n"
    # Attributes whose value may contain "@", "(" and ")"
    _LENIENT_ATTRS = frozenset({"title", "href", "alt"})
    # Characters a CSS class name may consist of
    _CLASS_CHARS = frozenset(string.ascii_letters + string.digits + "-")

    def __init__(self, validHtml: "HtmlBoneConfiguration | None" = None, srcSet=None, convert_charrefs: bool = True):
        super().__init__(convert_charrefs=convert_charrefs)
        self.result = ""  # The final result that will be returned
        self.openTagsList = []  # List of tags that still need to be closed
        self.tagCache = []  # (start-tag-markup, tag-name) tuples that have been processed but not written yet
        self.validHtml = validHtml
        self.srcSet = srcSet

    def handle_data(self, data):
        """
        Handles the data encountered in the HTML content being parsed. Escapes special characters
        and appends the data to the result if it is not only whitespace characters.

        :param str data: The data encountered by the parser.
        """
        data = str(data).translate(HtmlSerializer.__html_serializer_trans)
        if data.strip():
            self.flushCache()
            self.result += data

    def handle_charref(self, name):
        """
        Handles character references in the HTML content being parsed and appends the character reference to the
        result.

        :param str name: The name of the character reference.
        """
        self.flushCache()
        self.result += f"&#{name};"

    def handle_entityref(self, name):  # FIXME
        """
        Handles entity references in the HTML content being parsed and appends the entity reference to the result.
        Entity names that are not listed in ``html.entities.entitydefs`` are dropped.

        :param str name: The name of the entity reference.
        """
        if name in html.entities.entitydefs:
            self.flushCache()
            self.result += f"&{name};"

    def flushCache(self):
        """
        Flush pending tags into the result and push their corresponding end-tags onto the stack
        """
        for start, end in self.tagCache:
            self.result += start
            # The most recently opened tag is kept at the front, as it has to be closed first
            self.openTagsList.insert(0, end)
        self.tagCache = []

    def _filter_styles(self, styles):
        """Return the white-listed ``{directive: value}`` pairs out of a list of raw style declarations."""
        accepted = {}
        for declaration in styles:
            style, colon, value = declaration.partition(":")
            if not colon:
                # Not a "directive: value" pair (e.g. the empty rest after a trailing ";")
                continue
            style = style.strip()
            value = value.strip()
            if any(c in style or c in value for c in self._FILTER_CHARS):
                # Either the key or the value contains a character that's not supposed to be there
                continue
            if value.lower().startswith(("expression", "import")):
                # IE evaluates JS inside styles if the keyword expression is present
                continue
            if style in self.validHtml["validStyles"] and not any(x in value for x in ("\"", ":", ";")):
                accepted[style] = value
        return accepted

    def _filter_classes(self, classes):
        """Return the class names that are white-listed, either literally or by a ``prefix*`` entry."""
        accepted = []
        for currentClass in classes:
            if not all(x in self._CLASS_CHARS for x in currentClass):
                # The class contains invalid characters
                continue
            for validClass in self.validHtml["validClasses"]:
                # Check if the classname matches or is white-listed by a prefix
                if validClass == currentClass or (
                    validClass.endswith("*") and currentClass.startswith(validClass[:-1])
                ):
                    accepted.append(currentClass)
                    break
        return accepted

    def handle_starttag(self, tag, attrs):
        """
        Handles start tags in the HTML content being parsed. Filters out invalid tags and attributes and
        processes valid ones.

        :param str tag: The current start tag encountered by the parser.
        :param List[Tuple[str, str]] attrs: A list of tuples containing the attribute name and value of the current tag.
        """
        if not (self.validHtml and tag in self.validHtml["validTags"]):
            # The tag is not white-listed (or there is no configuration at all): emit a blank instead
            self.result += " "
            return

        cacheTagStart = "<" + tag
        isBlankTarget = False
        styles = None
        classes = None
        # A tag without an entry in validAttrs accepts no attributes
        allowedAttrs = self.validHtml["validAttrs"].get(tag, ())

        for k, v in attrs:
            k = k.strip()
            # HTMLParser reports attributes without a value (e.g. <a download>) as None
            v = (v or "").strip()
            if any(c in k or c in v for c in self._FILTER_CHARS):
                if k in self._LENIENT_ATTRS and not any(c in v for c in self._STRICT_FILTER_CHARS):
                    # If we have a title, href or alt attribute, ignore @ and ()
                    pass
                else:
                    # Either the key or the value contains a character that's not supposed to be there
                    continue
            elif k == "class":
                # Classes are handled below
                classes = v.split(" ")
                continue
            elif k == "style":
                # Styles are handled below
                styles = v.split(";")
                continue
            elif k == "src":
                # We ensure that any src tag starts with an actual url
                if not v.lower().startswith(("http://", "https://", "/")):
                    continue

                file = getattr(conf.main_app.vi, "file", None)
                if file and (filepath := file.parse_download_url(v)):
                    v = file.create_download_url(
                        filepath.dlkey,
                        filepath.filename,
                        filepath.is_derived,
                        expires=None
                    )

                    if self.srcSet:
                        # Build the src set with files already available. If a derived file is not yet build,
                        # getReferencedBlobs will catch it, build it, and we're going to be re-called afterwards.
                        srcSet = file.create_src_set(
                            filepath.dlkey,
                            None,
                            self.srcSet.get("width"),
                            self.srcSet.get("height")
                        )
                        cacheTagStart += f' srcSet="{srcSet}"'

            if k not in allowedAttrs:
                # That attribute is not valid on this tag
                continue
            # Never emit event handlers (on...) or javascript: URLs
            if not k.lower().startswith("on") and not v.lower().startswith("javascript"):
                cacheTagStart += f' {k}="{v}"'
            if tag == "a" and k == "target" and v.lower() == "_blank":
                isBlankTarget = True

        if styles:
            styleRes = self._filter_styles(styles)
            if styleRes:
                cacheTagStart += ' style="' + "; ".join(f"{k}: {v}" for k, v in styleRes.items()) + '"'
        if classes:
            validClasses = self._filter_classes(classes)
            if validClasses:
                cacheTagStart += ' class="' + " ".join(validClasses) + '"'
        if isBlankTarget:
            # Add rel tag to prevent the browser to pass window.opener around
            cacheTagStart += " rel=\"noopener noreferrer\""

        if tag in self.validHtml["singleTags"]:
            # Single-Tags do have a visual representation; ensure it makes it into the result
            self.flushCache()
            self.result += cacheTagStart + ">"  # dont need slash in void elements in html5
        else:
            # We opened a 'normal' tag; push it on the cache so it can be discarded later if
            # we detect it has no content
            self.tagCache.append((cacheTagStart + ">", tag))

    def handle_endtag(self, tag):
        """
        Handles end tags in the HTML content being parsed. Closes open tags and discards invalid ones.

        :param str tag: The current end tag encountered by the parser.
        """
        if not self.validHtml:
            return

        if self.tagCache and tag in [x[1] for x in self.tagCache] + self.openTagsList:
            # The element (or one of its ancestors) is known: silently drop the still content-less
            # tags from the end of the cache, stopping once the element itself has been dropped.
            # Popping from the end (instead of removing by value) keeps the order of the remaining
            # entries intact when the same start tag is cached more than once.
            while self.tagCache:
                _, cachedTag = self.tagCache.pop()
                if cachedTag == tag:
                    return

        if tag in self.openTagsList:
            # Close all currently open Tags until we reach the current one.
            for endTag in self.openTagsList[:]:
                self.result += f"</{endTag}>"
                self.openTagsList.remove(endTag)
                if endTag == tag:
                    break

    def cleanup(self):  # FIXME: swapped tags
        """ Append missing closing tags to the result."""
        self.flushCache()
        self.result += "".join(f"</{tag}>" for tag in self.openTagsList)

    def sanitize(self, instr):
        """
        Sanitizes the input HTML string by removing invalid tags and attributes while retaining valid ones.

        :param str instr: The input HTML string to be sanitized.
        :return: The sanitized HTML string.
        :rtype: str
        """
        # Start from a clean state, so that an instance can be used for more than one document
        self.reset()
        self.result = ""
        self.openTagsList = []
        self.tagCache = []
        self.feed(instr)
        self.close()
        self.cleanup()
        return self.result
class TextBone(RawBone):
    """
    A bone for storing and validating HTML or plain text content. Can be configured to allow
    only specific HTML tags and attributes, and enforce a maximum length. Supports the use of
    srcset for embedded images.

    :param validHtml: A dictionary containing allowed HTML tags and their attributes.
        Defaults to `conf.bone_html_default_allow`.
    :param max_length: The maximum allowed length for the content. Defaults to 200000.
    :param languages: If set, this bone can store a different content for each language
    :param srcSet: An optional dictionary containing width and height for srcset generation.
        Must be a dict of "width": [List of Ints], "height": [List of Ints], eg {"height": [720, 1080]}
    :param indexed: Whether the content should be indexed for searching. Defaults to False.
    :param kwargs: Additional keyword arguments to be passed to the base class constructor.
    """

    class __undefinedC__:
        """Sentinel type: tells "validHtml not passed" apart from an explicit ``validHtml=None``."""
        pass

    type = "text"

    def __init__(
        self,
        *,
        validHtml: None | HtmlBoneConfiguration = __undefinedC__,
        max_length: int = 200000,
        srcSet: t.Optional[dict[str, list]] = None,
        indexed: bool = False,
        **kwargs
    ):
        """
        :param validHtml: If set, must be a structure like `conf.bone_html_default_allow`
        :param languages: If set, this bone can store a different content for each language
        :param max_length: Limit content to max_length characters (checked against ``len(value)``)
        :param indexed: Must not be set True, unless you limit max_length accordingly
        :param srcSet: If set, inject srcset tags to embedded images. Must be a dict of
            "width": [List of Ints], "height": [List of Ints], eg {"height": [720, 1080]}
        """
        # fixme: Remove in viur-core >= 4
        if "maxLength" in kwargs:
            # stacklevel=2 attributes the warning to the code that passed the deprecated parameter
            warnings.warn(
                "maxLength parameter is deprecated, please use max_length",
                DeprecationWarning,
                stacklevel=2,
            )
            max_length = kwargs.pop("maxLength")
        super().__init__(indexed=indexed, **kwargs)

        # Compare the sentinel by identity; None is a valid, explicit value for validHtml
        if validHtml is TextBone.__undefinedC__:
            validHtml = conf.bone_html_default_allow

        self.validHtml = validHtml
        self.max_length = max_length
        self.srcSet = srcSet

    def singleValueSerialize(self, value, skel: 'SkeletonInstance', name: str, parentIndexed: bool):
        """
        Serializes a single value of the TextBone instance for storage.

        This method takes the value as-is without any additional processing, since it's already stored in a format
        suitable for serialization.
        """
        return value

    def singleValueFromClient(self, value, skel, bone_name, client_data):
        """
        Validates a single client value and sanitizes it with :class:`HtmlSerializer`.

        :return: A tuple of the sanitized value and None on success; otherwise the empty value
            and a list containing one ReadFromClientError of severity Invalid.
        """
        if err := self.isInvalid(value):  # None on success, error-str otherwise
            return self.getEmptyValue(), [ReadFromClientError(ReadFromClientErrorSeverity.Invalid, err)]
        return HtmlSerializer(self.validHtml, self.srcSet, False).sanitize(value), None

    def getEmptyValue(self):
        """
        Returns an empty value for the TextBone instance.

        This method is used to represent an empty or unset value for the TextBone.

        :return: An empty string.
        :rtype: str
        """
        return ""

    def isInvalid(self, value):
        """
        Checks if the given value is valid for this TextBone instance.

        This method checks whether the given value is valid according to the TextBone's constraints (e.g., not
        None and within the maximum length).

        :param value: The value to be checked for validity.
        :return: Returns None if the value is valid, or an error message string otherwise.
        :rtype: Optional[str]
        """
        if value is None:
            return i18n.translate("core.bones.error.novalueentered", "No value entered")
        if len(value) > self.max_length:
            return i18n.translate("core.bones.error.maximumlengthexceeded", "Maximum length exceeded")
        return None

    def getReferencedBlobs(self, skel: 'viur.core.skeleton.SkeletonInstance', name: str) -> set[str]:
        """
        Extracts and returns the blob keys of referenced files in the HTML content of the TextBone instance.

        This method parses the HTML content of the TextBone to identify embedded images or file hrefs,
        collects their blob keys, and ensures that they are not deleted even if removed from the file browser,
        preventing broken links or images in the TextBone content.

        :param SkeletonInstance skel: A SkeletonInstance object containing the data of an entry.
        :param str name: The name of the TextBone for which to find referenced blobs.
        :return: A set containing the blob keys of the referenced files in the TextBone's HTML content.
        :rtype: Set[str]
        """
        collector = CollectBlobKeys()

        for idx, lang, value in self.iter_bone_value(skel, name):
            if value:
                collector.feed(value)

        blob_keys = collector.blobs

        if blob_keys and self.srcSet:
            # One thumbnail derivation per configured width and per configured height
            widths = self.srcSet.get("width") or []
            heights = self.srcSet.get("height") or []
            derive_dict = {
                "thumbnail": [{"width": x} for x in widths] + [{"height": x} for x in heights]
            }
            # NOTE(review): imported locally, presumably to avoid a circular import - confirm
            from viur.core.bones.file import ensureDerived
            for blob_key in blob_keys:
                file_obj = db.Query("file").filter("dlkey =", blob_key) \
                    .order(("creationdate", db.SortOrder.Ascending)).getEntry()
                if file_obj:
                    ensureDerived(file_obj.key, f"{skel.kindName}_{name}", derive_dict, skel["key"])

        return blob_keys

    def refresh(self, skel, boneName) -> None:
        """
        Re-parses the text content of the TextBone instance to rebuild the src-set if necessary.

        This method is useful when the src-set configuration has changed and needs to be applied
        to the existing HTML content. It re-parses the content and updates the src-set attributes
        accordingly.

        :param SkeletonInstance skel: A SkeletonInstance object containing the data of an entry.
        :param str boneName: The name of the TextBone for which to refresh the src-set.
        """
        if not self.srcSet:
            return

        val = skel[boneName]
        if self.languages and isinstance(val, dict):
            skel[boneName] = {k: self.singleValueFromClient(v, skel, boneName, None)[0] for k, v in val.items()}
        elif not self.languages and isinstance(val, str):
            skel[boneName] = self.singleValueFromClient(val, skel, boneName, None)[0]

    def getUniquePropertyIndexValues(self, valuesCache: dict, name: str) -> list[str]:
        """
        Retrieves the unique property index values for the TextBone.

        If the TextBone supports multiple languages, this method raises a NotImplementedError, as it's unclear
        whether each language should be kept distinct or not. Otherwise, it calls the superclass's
        getUniquePropertyIndexValues method to retrieve the unique property index values.

        :param valuesCache: A dictionary containing the cached values for the TextBone.
        :param name: The name of the TextBone.
        :return: A list of unique property index values for the TextBone.
        :raises NotImplementedError: If the TextBone supports multiple languages.
        """
        if self.languages:
            # Not yet implemented as it's unclear if we should keep each language distinct or not
            raise NotImplementedError()

        return super().getUniquePropertyIndexValues(valuesCache, name)

    def structure(self) -> dict:
        """Returns the structure of the base class, extended by the ``valid_html`` configuration."""
        return super().structure() | {
            "valid_html": self.validHtml,
        }