Coverage for /home/runner/work/viur-core/viur-core/viur/src/viur/core/bones/text.py: 50%

"""
The `text` module contains the `TextBone` and a custom HTML parser
to validate and extract client data for the `TextBone`.
"""

5import html

6import string

7import typing as t

8import warnings

9from html.parser import HTMLParser

10from viur.core import db, conf, i18n

11from .base import ReadFromClientError, ReadFromClientErrorSeverity

12from .raw import RawBone

class HtmlBoneConfiguration(t.TypedDict):
    """A dictionary containing configurations for handling HTML content in TextBone instances."""

    validTags: list[str]
    """A list of valid HTML tags allowed in TextBone instances."""

    validAttrs: dict[str, list[str]]
    """A dictionary mapping valid attributes for each tag. If a tag is not listed, this tag accepts no attributes."""

    validStyles: list[str]
    """A list of allowed CSS directives for the TextBone instances."""

    validClasses: list[str]
    """A list of valid CSS class names allowed in TextBone instances. A trailing ``*`` marks a prefix match."""

    singleTags: list[str]
    """A list of self-closing HTML tags that don't have corresponding end tags."""

class CollectBlobKeys(HTMLParser):
    """
    A custom HTML parser that collects the blob keys of files referenced by
    ``<img>`` tags (via their ``src`` attribute) and ``<a>`` tags (via their ``href`` attribute).

    After feeding HTML into the parser, the collected keys are available in :attr:`blobs`.
    """

    # Attributes that may carry a download URL: images use "src", links use "href".
    _URL_ATTRIBUTES = ("src", "href")

    def __init__(self):
        super().__init__()
        self.blobs = set()

    def handle_starttag(self, tag, attrs):
        """
        Handles the start tag in the HTML content being parsed. If the tag is an <a> or <img> element,
        the URL found in its "src"/"href" attribute is handed to the file module's
        ``parse_download_url``; if that yields a result, its ``dlkey`` is added to the "blobs" set.

        :param str tag: The current start tag encountered by the parser.
        :param List[Tuple[str, str]] attrs: A list of tuples containing the attribute name and value of the current tag.
        """
        if tag not in ("a", "img"):
            return

        for key, value in attrs:
            # HTMLParser reports attributes without a value (e.g. `<img src>`) as None;
            # there is no URL to inspect in that case.
            if key not in self._URL_ATTRIBUTES or not value:
                continue

            # The file module is optional; without it no download URL can be resolved.
            file = getattr(conf.main_app.vi, "file", None)
            if file and (filepath := file.parse_download_url(value)):
                self.blobs.add(filepath.dlkey)

class HtmlSerializer(HTMLParser):
    """
    A custom HTML parser that extends the HTMLParser class to sanitize and serialize HTML content
    by removing invalid tags and attributes while retaining the valid ones.

    Start tags are not written immediately: they are kept in ``tagCache`` until some content
    (text, a character/entity reference or a single tag) shows up, so that empty elements
    are dropped from the result.

    :param dict validHtml: A dictionary containing valid HTML tags, attributes, styles, and classes.
        If it is falsy, every tag is discarded.
    :param dict srcSet: A dictionary containing width and height for srcset attribute processing.
    :param bool convert_charrefs: Passed through to :class:`html.parser.HTMLParser`.
    """

    # Text content is escaped through this table; newlines and NUL bytes are removed.
    __html_serializer_trans = str.maketrans({
        "<": "&lt;",
        ">": "&gt;",
        "\"": "&quot;",
        "'": "&#39;",
        "\n": "",
        "\0": "",
    })

    # Characters that must never appear in an attribute name or value.
    _FILTER_CHARS = "\"'\\\0\r\n@()"
    # Subset of _FILTER_CHARS that stays forbidden even in title/href/alt values.
    _QUOTE_CHARS = "\"'\\\0\r\n"
    # Characters a CSS class name may consist of.
    _VALID_CLASS_CHARS = frozenset(string.ascii_lowercase + string.ascii_uppercase + string.digits + "-")

    def __init__(self, validHtml: "HtmlBoneConfiguration | None" = None, srcSet=None, convert_charrefs: bool = True):
        super().__init__(convert_charrefs=convert_charrefs)
        self.result = ""  # The final result that will be returned
        self.openTagsList = []  # List of tags that still need to be closed (innermost first)
        self.tagCache = []  # (start-tag markup, tag name) tuples that have been processed but not written yet
        self.validHtml = validHtml
        self.srcSet = srcSet

    def handle_data(self, data):
        """
        Handles the data encountered in the HTML content being parsed. Escapes special characters
        and appends the data to the result if it is not only whitespace characters.

        :param str data: The data encountered by the parser.
        """
        data = str(data).translate(HtmlSerializer.__html_serializer_trans)
        if data.strip():
            self.flushCache()
            self.result += data

    def handle_charref(self, name):
        """
        Handles character references in the HTML content being parsed and appends the character reference to the
        result.

        :param str name: The name of the character reference.
        """
        self.flushCache()
        self.result += f"&#{name};"

    def handle_entityref(self, name):  # FIXME
        """
        Handles entity references in the HTML content being parsed and appends the entity reference to the result.
        References to entities not listed in ``html.entities.entitydefs`` are dropped.

        :param str name: The name of the entity reference.
        """
        if name in html.entities.entitydefs:
            self.flushCache()
            self.result += f"&{name};"

    def flushCache(self):
        """
        Flush pending tags into the result and push their corresponding end-tags onto the stack
        """
        for start, end in self.tagCache:
            self.result += start
            self.openTagsList.insert(0, end)
        self.tagCache = []

    def handle_starttag(self, tag, attrs):
        """
        Handles start tags in the HTML content being parsed. Filters out invalid tags and attributes and
        processes valid ones.

        :param str tag: The current start tag encountered by the parser.
        :param List[Tuple[str, str]] attrs: A list of tuples containing the attribute name and value of the current tag.
        """
        if not (self.validHtml and tag in self.validHtml["validTags"]):
            # The tag is discarded; a blank is written in its place
            self.result += " "
            return

        cacheTagStart = "<" + tag
        isBlankTarget = False
        styles = None
        classes = None
        # A tag that is not listed in validAttrs accepts no attributes at all
        allowedAttrs = self.validHtml["validAttrs"].get(tag, ())

        for k, v in attrs:
            k = k.strip()
            # HTMLParser reports attributes without a value (e.g. `<input disabled>`) as None
            v = (v or "").strip()

            if any(c in k or c in v for c in self._FILTER_CHARS):
                # If we have a title, href or alt attribute, ignore @ and ()
                if k not in {"title", "href", "alt"} or any(c in v for c in self._QUOTE_CHARS):
                    # Either the key or the value contains a character that's not supposed to be there
                    continue
            elif k == "class":
                # Classes are handled below
                classes = v.split(" ")
                continue
            elif k == "style":
                # Styles are handled below
                styles = v.split(";")
                continue
            elif k == "src":
                # We ensure that any src tag starts with an actual url
                if not v.lower().startswith(("http://", "https://", "/")):
                    continue

                file = getattr(conf.main_app.vi, "file", None)
                if file and (filepath := file.parse_download_url(v)):
                    v = file.create_download_url(
                        filepath.dlkey,
                        filepath.filename,
                        filepath.is_derived,
                        expires=None
                    )

                    if self.srcSet:
                        # Build the src set with files already available. If a derived file is not yet build,
                        # getReferencedBlobs will catch it, build it, and we're going to be re-called afterwards.
                        srcSet = file.create_src_set(
                            filepath.dlkey,
                            None,
                            self.srcSet.get("width"),
                            self.srcSet.get("height")
                        )
                        cacheTagStart += f' srcSet="{srcSet}"'

            if k not in allowedAttrs:
                # That attribute is not valid on this tag
                continue
            # Never emit event handlers or javascript: URLs
            if k.lower()[0:2] != "on" and v.lower()[0:10] != "javascript":
                cacheTagStart += f' {k}="{v}"'
            if tag == "a" and k == "target" and v.lower() == "_blank":
                isBlankTarget = True

        if styles:
            if styleValue := self._sanitize_styles(styles):
                cacheTagStart += f' style="{styleValue}"'
        if classes:
            if classValue := self._sanitize_classes(classes):
                cacheTagStart += f' class="{classValue}"'
        if isBlankTarget:
            # Add rel tag to prevent the browser to pass window.opener around
            cacheTagStart += " rel=\"noopener noreferrer\""

        if tag in self.validHtml["singleTags"]:
            # Single-Tags do have a visual representation; ensure it makes it into the result
            self.flushCache()
            self.result += cacheTagStart + ">"  # dont need slash in void elements in html5
        else:
            # We opened a 'normal' tag; push it on the cache so it can be discarded later if
            # we detect it has no content
            cacheTagStart += ">"
            self.tagCache.append((cacheTagStart, tag))

    def _sanitize_styles(self, styles):
        """Return the allowed ``name: value`` declarations of *styles*, joined by ``"; "`` (may be empty)."""
        accepted = {}
        for declaration in styles:
            name, separator, value = declaration.partition(":")
            if not separator:
                # Not a `name: value` declaration (e.g. the empty rest behind a trailing ";")
                continue
            name = name.strip()
            value = value.strip()
            if any(c in name or c in value for c in self._FILTER_CHARS):
                # Either the key or the value contains a character that's not supposed to be there
                continue
            if value.lower().startswith(("expression", "import")):
                # IE evaluates JS inside styles if the keyword expression is present
                continue
            if name in self.validHtml["validStyles"] and not any(c in value for c in "\":;"):
                accepted[name] = value
        return "; ".join(f"{name}: {value}" for name, value in accepted.items())

    def _sanitize_classes(self, classes):
        """Return the allowed class names of *classes*, joined by a blank (may be empty)."""
        accepted = []
        for currentClass in classes:
            if not currentClass or not all(c in self._VALID_CLASS_CHARS for c in currentClass):
                # The class is empty or contains invalid characters
                continue
            if self._is_class_allowed(currentClass):
                accepted.append(currentClass)
        return " ".join(accepted)

    def _is_class_allowed(self, currentClass):
        """Check if the classname matches an entry of validClasses or is white-listed by a ``prefix*`` entry."""
        for validClass in self.validHtml["validClasses"]:
            if validClass == currentClass:
                return True
            if validClass.endswith("*") and currentClass.startswith(validClass[:-1]):
                return True
        return False

    def handle_endtag(self, tag):
        """
        Handles end tags in the HTML content being parsed. Closes open tags and discards invalid ones.

        :param str tag: The current end tag encountered by the parser.
        """
        if not self.validHtml:
            return

        if self.tagCache and tag in [x[1] for x in self.tagCache] + self.openTagsList:
            # Check if that element is still on the cache
            # and just silently drop the cache up to that point
            while self.tagCache:
                _, cachedTag = self.tagCache.pop()
                if cachedTag == tag:
                    return

        if tag in self.openTagsList:
            # Close all currently open tags until we reach the current one
            while self.openTagsList:
                endTag = self.openTagsList.pop(0)
                self.result += f"</{endTag}>"
                if endTag == tag:
                    break

    def cleanup(self):  # FIXME: swapped tags
        """ Append missing closing tags to the result."""
        self.flushCache()
        self.result += "".join(f"</{tag}>" for tag in self.openTagsList)
        # Everything is closed now; don't close it again on a repeated call
        self.openTagsList = []

    def sanitize(self, instr):
        """
        Sanitizes the input HTML string by removing invalid tags and attributes while retaining valid ones.

        :param str instr: The input HTML string to be sanitized.
        :return: The sanitized HTML string.
        :rtype: str
        """
        # Start from a clean state so an instance can be reused for several inputs
        self.reset()
        self.result = ""
        self.openTagsList = []
        self.tagCache = []
        self.feed(instr)
        self.close()
        self.cleanup()
        return self.result

288

289

class TextBone(RawBone):
    """
    A bone for storing and validating HTML or plain text content. Can be configured to allow
    only specific HTML tags and attributes, and enforce a maximum length. Supports the use of
    srcset for embedded images.

    :param validHtml: A dictionary containing allowed HTML tags and their attributes.
        Defaults to `conf.bone_html_default_allow`.
    :param max_length: The maximum allowed length for the content. Defaults to 200000.
    :param languages: If set, this bone can store a different content for each language
    :param srcSet: An optional dictionary containing width and height for srcset generation.
        Must be a dict of "width": [List of Ints], "height": [List of Ints], eg {"height": [720, 1080]}
    :param indexed: Whether the content should be indexed for searching. Defaults to False.
    :param kwargs: Additional keyword arguments to be passed to the base class constructor.
    """

    class __undefinedC__:
        """Sentinel telling "validHtml was not passed" apart from an explicit ``None``."""
        pass

    type = "text"

    def __init__(
        self,
        *,
        validHtml: None | HtmlBoneConfiguration = __undefinedC__,
        max_length: int = 200000,
        srcSet: t.Optional[dict[str, list]] = None,
        indexed: bool = False,
        **kwargs
    ):
        """
        :param validHtml: If set, must be a structure like `conf.bone_html_default_allow`
        :param languages: If set, this bone can store a different content for each language
        :param max_length: Limit content to max_length characters (checked with ``len()``)
        :param indexed: Must not be set True, unless you limit max_length accordingly
        :param srcSet: If set, inject srcset tags to embedded images. Must be a dict of
            "width": [List of Ints], "height": [List of Ints], eg {"height": [720, 1080]}
        """
        # fixme: Remove in viur-core >= 4
        if "maxLength" in kwargs:
            warnings.warn("maxLength parameter is deprecated, please use max_length", DeprecationWarning)
            max_length = kwargs.pop("maxLength")
        super().__init__(indexed=indexed, **kwargs)

        # Identity check: the sentinel is a class object and must not be compared by value
        if validHtml is TextBone.__undefinedC__:
            validHtml = conf.bone_html_default_allow

        self.validHtml = validHtml
        self.max_length = max_length
        self.srcSet = srcSet

    def singleValueSerialize(self, value, skel: 'SkeletonInstance', name: str, parentIndexed: bool):
        """
        Serializes a single value of the TextBone instance for storage.

        This method takes the value as-is without any additional processing, since it's already stored in a format
        suitable for serialization.
        """
        return value

    def singleValueFromClient(self, value, skel, bone_name, client_data):
        """
        Validates a single client-supplied value and sanitizes it with :class:`HtmlSerializer`.

        :return: A tuple of the sanitized value and ``None`` on success, or of the empty value
            and a list holding one ``ReadFromClientError`` if :meth:`isInvalid` reports an error.
        """
        if err := self.isInvalid(value):  # Returns None on success, error-str otherwise
            return self.getEmptyValue(), [ReadFromClientError(ReadFromClientErrorSeverity.Invalid, err)]

        return HtmlSerializer(self.validHtml, self.srcSet, False).sanitize(value), None

    def getEmptyValue(self):
        """
        Returns an empty value for the TextBone instance.

        This method is used to represent an empty or unset value for the TextBone.

        :return: An empty string.
        :rtype: str
        """
        return ""

    def isInvalid(self, value):
        """
        Checks if the given value is valid for this TextBone instance.

        This method checks whether the given value is valid according to the TextBone's constraints (e.g., not
        None and within the maximum length).

        :param value: The value to be checked for validity.
        :return: Returns None if the value is valid, or an error message string otherwise.
        :rtype: Optional[str]
        """
        if value is None:
            return i18n.translate("core.bones.error.novalueentered", "No value entered")
        if len(value) > self.max_length:
            return i18n.translate("core.bones.error.maximumlengthexceeded", "Maximum length exceeded")
        return None

    def getReferencedBlobs(self, skel: 'viur.core.skeleton.SkeletonInstance', name: str) -> set[str]:
        """
        Extracts and returns the blob keys of referenced files in the HTML content of the TextBone instance.

        This method parses the HTML content of the TextBone to identify embedded images or file hrefs,
        collects their blob keys, and ensures that they are not deleted even if removed from the file browser,
        preventing broken links or images in the TextBone content.

        If a srcSet is configured, the derived thumbnails for every referenced file are requested
        through ``ensureDerived`` as well.

        :param SkeletonInstance skel: A SkeletonInstance object containing the data of an entry.
        :param str name: The name of the TextBone for which to find referenced blobs.
        :return: A set containing the blob keys of the referenced files in the TextBone's HTML content.
        :rtype: Set[str]
        """
        collector = CollectBlobKeys()

        for idx, lang, value in self.iter_bone_value(skel, name):
            if value:
                collector.feed(value)

        blob_keys = collector.blobs

        if blob_keys and self.srcSet:
            derive_dict = {
                "thumbnail": [
                    {"width": x} for x in (self.srcSet.get("width") or [])
                ] + [
                    {"height": x} for x in (self.srcSet.get("height") or [])
                ]
            }
            from viur.core.bones.file import ensureDerived
            for blob_key in blob_keys:
                file_obj = db.Query("file").filter("dlkey =", blob_key) \
                    .order(("creationdate", db.SortOrder.Ascending)).getEntry()
                if file_obj:
                    ensureDerived(file_obj.key, f"{skel.kindName}_{name}", derive_dict, skel["key"])

        return blob_keys

    def refresh(self, skel, boneName) -> None:
        """
        Re-parses the text content of the TextBone instance to rebuild the src-set if necessary.

        This method is useful when the src-set configuration has changed and needs to be applied
        to the existing HTML content. It re-parses the content and updates the src-set attributes
        accordingly. Nothing happens if no srcSet is configured.

        :param SkeletonInstance skel: A SkeletonInstance object containing the data of an entry.
        :param str boneName: The name of the TextBone for which to refresh the src-set.
        """
        if self.srcSet:
            val = skel[boneName]
            if self.languages and isinstance(val, dict):
                skel[boneName] = {k: self.singleValueFromClient(v, skel, boneName, None)[0] for k, v in val.items()}
            elif not self.languages and isinstance(val, str):
                skel[boneName] = self.singleValueFromClient(val, skel, boneName, None)[0]

    def getUniquePropertyIndexValues(self, valuesCache: dict, name: str) -> list[str]:
        """
        Retrieves the unique property index values for the TextBone.

        If the TextBone supports multiple languages, this method raises a NotImplementedError, as it's unclear
        whether each language should be kept distinct or not. Otherwise, it calls the superclass's
        getUniquePropertyIndexValues method to retrieve the unique property index values.

        :param valuesCache: A dictionary containing the cached values for the TextBone.
        :param name: The name of the TextBone.
        :return: A list of unique property index values for the TextBone.
        :raises NotImplementedError: If the TextBone supports multiple languages.
        """
        if self.languages:
            # Not yet implemented as it's unclear if we should keep each language distinct or not
            raise NotImplementedError()

        return super().getUniquePropertyIndexValues(valuesCache, name)

    def structure(self) -> dict:
        """Return the structure of the base class, extended by the ``valid_html`` configuration of this bone."""
        return super().structure() | {
            "valid_html": self.validHtml,
        }

Coverage for /home/runner/work/viur-core/viur-core/viur/src/viur/core/bones/text.py: 50%

213 statements