Coverage for /home/runner/work/viur-core/viur-core/viur/src/viur/core/bones/text.py: 50%

213 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-13 11:04 +0000

1""" 

2The `text` module contains the `TextBone` and a custom HTML-Parser 

3to validate and extract client data for the `TextBone`. 

4""" 

5import html 

6import string 

7import typing as t 

8import warnings 

9from html.parser import HTMLParser 

10from viur.core import db, conf, i18n 

11from .base import ReadFromClientError, ReadFromClientErrorSeverity 

12from .raw import RawBone 

13 

14 

class HtmlBoneConfiguration(t.TypedDict):
    """Whitelist settings that describe which HTML markup a TextBone keeps when sanitizing its content."""

    validTags: list[str]
    """Names of the HTML tags that are allowed to remain in the content."""

    validAttrs: dict[str, list[str]]
    """Permitted attribute names, keyed by tag name. A tag without an entry here may not carry any attribute."""

    validStyles: list[str]
    """CSS directives that are accepted inside a ``style`` attribute."""

    validClasses: list[str]
    """CSS class names that are accepted inside a ``class`` attribute."""

    singleTags: list[str]
    """Self-closing tags, i.e. tags that are written without a corresponding end tag."""

32 

33 

class CollectBlobKeys(HTMLParser):
    """
    A custom HTML parser that collects the blob keys of all files referenced by the HTML fed into it.

    Download URLs are looked up in the "src" attribute of <img> tags and in the "href" attribute of <a> tags
    ("src" on <a> is still honoured for backward compatibility). Every key found is added to ``self.blobs``.
    """

    # Attributes that may carry a download URL, per tag name
    _URL_ATTRS = {
        "a": ("href", "src"),
        "img": ("src",),
    }

    def __init__(self):
        super().__init__()
        self.blobs = set()  # dlkeys of all referenced files seen so far

    def handle_starttag(self, tag, attrs):
        """
        Handles the start tag in the HTML content being parsed. If the tag is an <a> or <img> element, the
        blob key is extracted from its URL attribute(s) and added to the "blobs" set.

        :param str tag: The current start tag encountered by the parser.
        :param List[Tuple[str, str]] attrs: A list of tuples containing the attribute name and value of the
            current tag. The value is ``None`` for attributes written without a value.
        """
        url_attrs = self._URL_ATTRS.get(tag)
        if not url_attrs:
            return

        file = None
        for key, value in attrs:
            # Skip foreign attributes and attributes without a value (``<img src>`` yields ``None``)
            if key not in url_attrs or not value:
                continue

            if file is None:
                # Resolve the file module lazily, only once an URL attribute has actually been found
                file = getattr(conf.main_app.vi, "file", None)
                if not file:
                    return

            if filepath := file.parse_download_url(value):
                self.blobs.add(filepath.dlkey)

58 

59 

class HtmlSerializer(HTMLParser):
    """
    A custom HTML parser that extends the HTMLParser class to sanitize and serialize HTML content
    by removing invalid tags and attributes while retaining the valid ones.

    Start tags are not written immediately: they are kept in ``tagCache`` until content follows, so that
    elements which turn out to have no content are discarded instead of being written to the result.

    :param dict validHtml: A dictionary containing valid HTML tags, attributes, styles, and classes.
        If it is falsy, no tag at all is kept.
    :param dict srcSet: A dictionary containing width and height for srcset attribute processing.
    :param bool convert_charrefs: Passed through to :class:`html.parser.HTMLParser`.
    """
    __html_serializer_trans = str.maketrans(
        {"<": "&lt;",
         ">": "&gt;",
         "\"": "&quot;",
         "'": "&#39;",
         "\n": "",
         "\0": ""})

    # Characters that are not accepted in attribute names/values and style declarations
    _FILTER_CHARS = "\"'\\\0\r\n@()"
    # Subset of the above that stays forbidden even in title/href/alt values
    _STRICT_FILTER_CHARS = "\"'\\\0\r\n"
    # Characters a CSS class name may consist of
    _VALID_CLASS_CHARS = frozenset(string.ascii_letters + string.digits + "-")

    def __init__(self, validHtml: "HtmlBoneConfiguration | None" = None, srcSet=None, convert_charrefs: bool = True):
        super().__init__(convert_charrefs=convert_charrefs)
        self.result = ""  # The final result that will be returned
        self.openTagsList = []  # Names of tags that still need to be closed, innermost first
        self.tagCache = []  # (start-tag markup, tag name) tuples that have been processed but not written yet
        self.validHtml = validHtml
        self.srcSet = srcSet

    def handle_data(self, data):
        """
        Handles the data encountered in the HTML content being parsed. Escapes special characters
        and appends the data to the result if it is not only whitespace characters.

        :param str data: The data encountered by the parser.
        """
        data = str(data).translate(HtmlSerializer.__html_serializer_trans)
        if data.strip():
            self.flushCache()
            self.result += data

    def handle_charref(self, name):
        """
        Handles character references in the HTML content being parsed and appends the character reference to the
        result.

        :param str name: The name of the character reference.
        """
        self.flushCache()
        self.result += f"&#{name};"

    def handle_entityref(self, name):  # FIXME
        """
        Handles entity references in the HTML content being parsed and appends the entity reference to the result.
        References that are not listed in ``html.entities.entitydefs`` are dropped.

        :param str name: The name of the entity reference.
        """
        if name in html.entities.entitydefs:
            self.flushCache()
            self.result += f"&{name};"

    def flushCache(self):
        """
        Flush pending tags into the result and push their corresponding end-tags onto the stack
        """
        for start, end in self.tagCache:
            self.result += start
            self.openTagsList.insert(0, end)
        self.tagCache = []

    @staticmethod
    def _is_script_url(value: str) -> bool:
        """Tell whether ``value`` starts with the ``javascript`` scheme once ignorable characters are removed."""
        # Browsers drop tab/newline characters anywhere in an URL and ignore leading control characters and
        # spaces, so "jav<TAB>ascript:" is still a script URL. Compare without C0 control characters.
        probe = "".join(ch for ch in value if ch >= " ").lstrip()
        return probe.lower().startswith("javascript")

    def _style_attribute(self, styles) -> str:
        """Build the `` style="..."`` markup from the whitelisted declarations in ``styles`` ("" if none is left)."""
        accepted = {}
        for declaration in styles:
            style, separator, value = declaration.partition(":")
            if not separator:
                # Not a "directive: value" pair (e.g. the empty remainder behind a trailing ";")
                continue
            style = style.strip()
            value = value.strip()
            if any(c in style or c in value for c in self._FILTER_CHARS):
                # Either the key or the value contains a character that's not supposed to be there
                continue
            if value.lower().startswith(("expression", "import")):
                # IE evaluates JS inside styles if the keyword expression is present
                continue
            if style in self.validHtml["validStyles"] and not any(x in value for x in "\":;"):
                accepted[style] = value
        if not accepted:
            return ""
        return ' style="' + "; ".join(f"{style}: {value}" for style, value in accepted.items()) + '"'

    def _is_valid_class(self, name: str) -> bool:
        """Check a single CSS class name against ``validClasses`` (exact match or prefix ending in ``*``)."""
        if not name or not all(ch in self._VALID_CLASS_CHARS for ch in name):
            # Empty (repeated blanks) or containing invalid characters
            return False
        for valid_class in self.validHtml["validClasses"]:
            if valid_class == name:
                return True
            if valid_class.endswith("*") and name.startswith(valid_class[:-1]):
                return True
        return False

    def _class_attribute(self, classes) -> str:
        """Build the `` class="..."`` markup from the whitelisted names in ``classes`` ("" if none is left)."""
        kept = [name for name in classes if self._is_valid_class(name)]
        if not kept:
            return ""
        return ' class="' + " ".join(kept) + '"'

    def handle_starttag(self, tag, attrs):
        """
        Handles start tags in the HTML content being parsed. Filters out invalid tags and attributes and
        processes valid ones.

        :param str tag: The current start tag encountered by the parser.
        :param List[Tuple[str, str]] attrs: A list of tuples containing the attribute name and value of the
            current tag. The value is ``None`` for attributes written without a value.
        """
        if not (self.validHtml and tag in self.validHtml["validTags"]):
            # The tag itself is dropped; a single blank is written in its place
            self.result += " "
            return

        cacheTagStart = "<" + tag
        isBlankTarget = False
        styles = None
        classes = None

        for k, v in attrs:
            k = k.strip()
            # Attributes without a value (``<a download>``) are reported as None by HTMLParser
            v = (v or "").strip()

            if any(c in k or c in v for c in self._FILTER_CHARS):
                if k not in {"title", "href", "alt"} or any(c in v for c in self._STRICT_FILTER_CHARS):
                    # Either the key or the value contains a character that's not supposed to be there
                    continue
                # If we have a title, href or alt attribute, ignore @ and ()
            elif k == "class":
                # Classes are handled below
                classes = v.split(" ")
                continue
            elif k == "style":
                # Styles are handled below
                styles = v.split(";")
                continue
            elif k == "src":
                # We ensure that any src tag starts with an actual url
                if not v.lower().startswith(("http://", "https://", "/")):
                    continue

                file = getattr(conf.main_app.vi, "file", None)
                if file and (filepath := file.parse_download_url(v)):
                    v = file.create_download_url(
                        filepath.dlkey,
                        filepath.filename,
                        filepath.is_derived,
                        expires=None
                    )

                    if self.srcSet:
                        # Build the src set with files already available. If a derived file is not yet build,
                        # getReferencedBlobs will catch it, build it, and we're going to be re-called afterwards.
                        src_set = file.create_src_set(
                            filepath.dlkey,
                            None,
                            self.srcSet.get("width"),
                            self.srcSet.get("height")
                        )
                        cacheTagStart += f' srcSet="{src_set}"'

            if k not in self.validHtml["validAttrs"].get(tag, ()):
                # That attribute is not valid on this tag
                continue
            if not k.lower().startswith("on") and not self._is_script_url(v):
                cacheTagStart += f' {k}="{v}"'
            if tag == "a" and k == "target" and v.lower() == "_blank":
                isBlankTarget = True

        if styles:
            cacheTagStart += self._style_attribute(styles)
        if classes:
            cacheTagStart += self._class_attribute(classes)
        if isBlankTarget:
            # Add rel tag to prevent the browser to pass window.opener around
            cacheTagStart += " rel=\"noopener noreferrer\""

        if tag in self.validHtml["singleTags"]:
            # Single-Tags do have a visual representation; ensure it makes it into the result
            self.flushCache()
            self.result += cacheTagStart + '>'  # dont need slash in void elements in html5
        else:
            # We opened a 'normal' tag; push it on the cache so it can be discarded later if
            # we detect it has no content
            cacheTagStart += '>'
            self.tagCache.append((cacheTagStart, tag))

    def handle_endtag(self, tag):
        """
        Handles end tags in the HTML content being parsed. Closes open tags and discards invalid ones.

        :param str tag: The current end tag encountered by the parser.
        """
        if not self.validHtml:
            return

        if self.tagCache and (
            tag in self.openTagsList or any(cached_tag == tag for _, cached_tag in self.tagCache)
        ):
            # Check if that element is still on the cache
            # and just silently drop the cache up to that point.
            # pop() takes the entries off the END of the cache; removing by value would take the first
            # equal entry instead and thereby reorder the tags that stay pending.
            while self.tagCache:
                _, cached_tag = self.tagCache.pop()
                if cached_tag == tag:
                    return

        if tag in self.openTagsList:
            # Close all currently open Tags until we reach the current one
            last = self.openTagsList.index(tag)
            self.result += "".join(f"</{end_tag}>" for end_tag in self.openTagsList[:last + 1])
            del self.openTagsList[:last + 1]

    def cleanup(self):  # FIXME: swapped tags
        """ Append missing closing tags to the result."""
        self.flushCache()
        self.result += "".join(f"</{tag}>" for tag in self.openTagsList)
        # Everything is closed now; a repeated call must not append the same end tags again
        self.openTagsList.clear()

    def sanitize(self, instr):
        """
        Sanitizes the input HTML string by removing invalid tags and attributes while retaining valid ones.

        :param str instr: The input HTML string to be sanitized.
        :return: The sanitized HTML string.
        :rtype: str
        """
        # Start from a clean parser, so that leftovers of a previous run (unparsed data, CDATA mode,
        # pending tags) cannot leak into this one when the instance is reused
        self.reset()
        self.result = ""
        self.openTagsList = []
        self.tagCache = []
        self.feed(instr)
        self.close()
        self.cleanup()
        return self.result

288 

289 

class TextBone(RawBone):
    """
    A bone for storing and validating HTML or plain text content. Can be configured to allow
    only specific HTML tags and attributes, and enforce a maximum length. Supports the use of
    srcset for embedded images.

    :param validHtml: A dictionary containing allowed HTML tags and their attributes.
        Defaults to `conf.bone_html_default_allow`.
    :param max_length: The maximum allowed length for the content. Defaults to 200000.
    :param languages: If set, this bone can store a different content for each language
    :param srcSet: An optional dictionary containing width and height for srcset generation.
        Must be a dict of "width": [List of Ints], "height": [List of Ints], eg {"height": [720, 1080]}
    :param indexed: Whether the content should be indexed for searching. Defaults to False.
    :param kwargs: Additional keyword arguments to be passed to the base class constructor.
    """

    class __undefinedC__:
        """Sentinel for "validHtml was not passed"; needed because ``None`` is an accepted value of its own."""

    type = "text"

    def __init__(
        self,
        *,
        validHtml: None | HtmlBoneConfiguration = __undefinedC__,
        max_length: int = 200000,
        srcSet: t.Optional[dict[str, list]] = None,
        indexed: bool = False,
        **kwargs
    ):
        """
        :param validHtml: If set, must be a structure like `conf.bone_html_default_allow`
        :param languages: If set, this bone can store a different content for each language
        :param max_length: Limit content to max_length (checked against ``len(value)``)
        :param indexed: Must not be set True, unless you limit max_length accordingly
        :param srcSet: If set, inject srcset tags to embedded images. Must be a dict of
            "width": [List of Ints], "height": [List of Ints], eg {"height": [720, 1080]}
        """
        # fixme: Remove in viur-core >= 4
        if "maxLength" in kwargs:
            warnings.warn("maxLength parameter is deprecated, please use max_length", DeprecationWarning)
            max_length = kwargs.pop("maxLength")
        super().__init__(indexed=indexed, **kwargs)

        # Identity check: the sentinel is a class object, it must never be matched via a custom __eq__
        if validHtml is TextBone.__undefinedC__:
            validHtml = conf.bone_html_default_allow

        self.validHtml = validHtml
        self.max_length = max_length
        self.srcSet = srcSet

    def singleValueSerialize(self, value, skel: 'SkeletonInstance', name: str, parentIndexed: bool):
        """
        Serializes a single value of the TextBone instance for storage.

        This method takes the value as-is without any additional processing, since it's already stored in a format
        suitable for serialization.
        """
        return value

    def singleValueFromClient(self, value, skel, bone_name, client_data):
        """
        Validates a single client value and sanitizes it with :class:`HtmlSerializer`.

        :return: A tuple of the sanitized value and None on success, otherwise a tuple of the empty value
            and a list holding one ReadFromClientError.
        """
        if err := self.isInvalid(value):  # Returns None on success, error-str otherwise
            return self.getEmptyValue(), [ReadFromClientError(ReadFromClientErrorSeverity.Invalid, err)]

        return HtmlSerializer(self.validHtml, self.srcSet, False).sanitize(value), None

    def getEmptyValue(self):
        """
        Returns an empty value for the TextBone instance.

        This method is used to represent an empty or unset value for the TextBone.

        :return: An empty string.
        :rtype: str
        """
        return ""

    def isInvalid(self, value):
        """
        Checks if the given value is valid for this TextBone instance.

        This method checks whether the given value is valid according to the TextBone's constraints (e.g., not
        None and within the maximum length).

        :param value: The value to be checked for validity.
        :return: Returns None if the value is valid, or an error message string otherwise.
        :rtype: Optional[str]
        """
        if value is None:
            return i18n.translate("core.bones.error.novalueentered", "No value entered")
        if len(value) > self.max_length:
            return i18n.translate("core.bones.error.maximumlengthexceeded", "Maximum length exceeded")
        return None

    def getReferencedBlobs(self, skel: 'viur.core.skeleton.SkeletonInstance', name: str) -> set[str]:
        """
        Extracts and returns the blob keys of referenced files in the HTML content of the TextBone instance.

        This method parses the HTML content of the TextBone to identify embedded images or file hrefs,
        collects their blob keys, and ensures that they are not deleted even if removed from the file browser,
        preventing broken links or images in the TextBone content.

        :param SkeletonInstance skel: A SkeletonInstance object containing the data of an entry.
        :param str name: The name of the TextBone for which to find referenced blobs.
        :return: A set containing the blob keys of the referenced files in the TextBone's HTML content.
        :rtype: Set[str]
        """
        collector = CollectBlobKeys()

        for _idx, _lang, value in self.iter_bone_value(skel, name):
            if value:
                collector.feed(value)
                # Finish this value and start over, so that an unterminated construct at the end of
                # one value cannot swallow the beginning of the next one
                collector.close()
                collector.reset()

        blob_keys = collector.blobs

        if blob_keys and self.srcSet:
            # One thumbnail per configured width and height
            derive_dict = {
                "thumbnail": [
                    {"width": x} for x in (self.srcSet.get("width") or [])
                ] + [
                    {"height": x} for x in (self.srcSet.get("height") or [])
                ]
            }
            from viur.core.bones.file import ensureDerived
            for blob_key in blob_keys:
                file_obj = db.Query("file").filter("dlkey =", blob_key) \
                    .order(("creationdate", db.SortOrder.Ascending)).getEntry()
                if file_obj:
                    ensureDerived(file_obj.key, f"{skel.kindName}_{name}", derive_dict, skel["key"])

        return blob_keys

    def refresh(self, skel, boneName) -> None:
        """
        Re-parses the text content of the TextBone instance to rebuild the src-set if necessary.

        This method is useful when the src-set configuration has changed and needs to be applied
        to the existing HTML content. It re-parses the content and updates the src-set attributes
        accordingly. Nothing happens if the bone has no srcSet configured.

        :param SkeletonInstance skel: A SkeletonInstance object containing the data of an entry.
        :param str boneName: The name of the TextBone for which to refresh the src-set.
        """
        if not self.srcSet:
            return

        val = skel[boneName]
        if self.languages and isinstance(val, dict):
            skel[boneName] = {
                lang: self.singleValueFromClient(text, skel, boneName, None)[0]
                for lang, text in val.items()
            }
        elif not self.languages and isinstance(val, str):
            skel[boneName] = self.singleValueFromClient(val, skel, boneName, None)[0]

    def getUniquePropertyIndexValues(self, valuesCache: dict, name: str) -> list[str]:
        """
        Retrieves the unique property index values for the TextBone.

        If the TextBone supports multiple languages, this method raises a NotImplementedError, as it's unclear
        whether each language should be kept distinct or not. Otherwise, it calls the superclass's
        getUniquePropertyIndexValues method to retrieve the unique property index values.

        :param valuesCache: A dictionary containing the cached values for the TextBone.
        :param name: The name of the TextBone.
        :return: A list of unique property index values for the TextBone.
        :raises NotImplementedError: If the TextBone supports multiple languages.
        """
        if self.languages:
            # Not yet implemented as it's unclear if we should keep each language distinct or not
            raise NotImplementedError()

        return super().getUniquePropertyIndexValues(valuesCache, name)

    def structure(self) -> dict:
        """Returns the structure of the base class, extended by the ``valid_html`` configuration of this bone."""
        return super().structure() | {
            "valid_html": self.validHtml,
        }