LCOV - code coverage report
Current view: top level - lib/src/utils - html_to_text.dart (source / functions) Hit Total Coverage
Test: merged.info Lines: 127 129 98.4 %
Date: 2024-09-04 20:26:16 Functions: 0 0 -

          Line data    Source code
       1             : /*
       2             :  *   Famedly Matrix SDK
       3             :  *   Copyright (C) 2021 Famedly GmbH
       4             :  *
       5             :  *   This program is free software: you can redistribute it and/or modify
       6             :  *   it under the terms of the GNU Affero General Public License as
       7             :  *   published by the Free Software Foundation, either version 3 of the
       8             :  *   License, or (at your option) any later version.
       9             :  *
      10             :  *   This program is distributed in the hope that it will be useful,
      11             :  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
      12             :  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
      13             :  *   GNU Affero General Public License for more details.
      14             :  *
      15             :  *   You should have received a copy of the GNU Affero General Public License
      16             :  *   along with this program.  If not, see <https://www.gnu.org/licenses/>.
      17             :  */
      18             : 
      19             : import 'package:collection/collection.dart';
      20             : import 'package:html/dom.dart';
      21             : import 'package:html/parser.dart';
      22             : import 'package:html_unescape/html_unescape.dart';
      23             : 
      24             : class HtmlToText {
      25             :   /// Convert an HTML string to a pseudo-markdown plain text representation, with
      26             :   /// `data-mx-spoiler` spans redacted
      27           3 :   static String convert(String html) {
      28             :     // riot-web is notorious for creating bad reply fallback events from invalid messages which, if
      29             :     // not handled properly, can lead to impersonation. As such, we strip the entire `<mx-reply>` tags
      30             :     // here already, to prevent that from happening.
      31             :     // We do *not* do this in an AST and just with simple regex here, as riot-web tends to create
      32             :     // miss-matching tags, and this way we actually correctly identify what we want to strip and, well,
      33             :     // strip it.
      34           3 :     final renderHtml = html.replaceAll(
      35           3 :         RegExp('<mx-reply>.*</mx-reply>',
      36             :             caseSensitive: false, multiLine: false, dotAll: true),
      37             :         '');
      38             : 
      39           3 :     final opts = _ConvertOpts();
      40           6 :     var reply = _walkNode(opts, parseFragment(renderHtml));
      41           6 :     reply = reply.replaceAll(RegExp(r'\s*$', multiLine: false), '');
      42             :     return reply;
      43             :   }
      44             : 
      45           2 :   static String _parsePreContent(_ConvertOpts opts, Element node) {
      46           2 :     var text = node.innerHtml;
      47             :     final match =
      48           2 :         RegExp(r'^<code([^>]*)>', multiLine: false, caseSensitive: false)
      49           2 :             .firstMatch(text);
      50             :     if (match == null) {
      51           4 :       text = HtmlUnescape().convert(text);
      52           2 :       if (text.isNotEmpty) {
      53           4 :         if (text[0] != '\n') {
      54           2 :           text = '\n$text';
      55             :         }
      56           8 :         if (text[text.length - 1] != '\n') {
      57           2 :           text += '\n';
      58             :         }
      59             :       }
      60             :       return text;
      61             :     }
      62             :     // remove <code> opening tag
      63           4 :     text = text.substring(match.end);
      64             :     // remove the </code> closing tag
      65           2 :     text = text.replaceAll(
      66           2 :         RegExp(r'</code>$', multiLine: false, caseSensitive: false), '');
      67           4 :     text = HtmlUnescape().convert(text);
      68           2 :     if (text.isNotEmpty) {
      69           4 :       if (text[0] != '\n') {
      70           2 :         text = '\n$text';
      71             :       }
      72           8 :       if (text[text.length - 1] != '\n') {
      73           2 :         text += '\n';
      74             :       }
      75             :     }
      76             :     final language =
      77           2 :         RegExp(r'language-(\w+)', multiLine: false, caseSensitive: false)
      78           4 :             .firstMatch(match.group(1)!);
      79             :     if (language != null) {
      80           4 :       text = language.group(1)! + text;
      81             :     }
      82             :     return text;
      83             :   }
      84             : 
      85           2 :   static String _parseBlockquoteContent(_ConvertOpts opts, Element node) {
      86           2 :     final msg = _walkChildNodes(opts, node);
      87          12 :     return '${msg.split('\n').map((s) => '> $s').join('\n')}\n';
      88             :   }
      89             : 
      90           2 :   static String _parseSpanContent(_ConvertOpts opts, Element node) {
      91           2 :     final content = _walkChildNodes(opts, node);
      92           6 :     if (node.attributes['data-mx-spoiler'] is String) {
      93           4 :       var spoiler = '█' * content.length;
      94           4 :       final reason = node.attributes['data-mx-spoiler'];
      95           2 :       if (reason != '') {
      96           2 :         spoiler = '($reason) $spoiler';
      97             :       }
      98             :       return spoiler;
      99             :     }
     100             :     return content;
     101             :   }
     102             : 
     103           2 :   static String _parseUlContent(_ConvertOpts opts, Element node) {
     104           4 :     opts.listDepth++;
     105           4 :     final entries = _listChildNodes(opts, node, {'li'});
     106           4 :     opts.listDepth--;
     107             :     final bulletPoint =
     108           8 :         _listBulletPoints[opts.listDepth % _listBulletPoints.length];
     109             : 
     110             :     return entries
     111           4 :         .map((s) =>
     112          14 :             '${'    ' * opts.listDepth}$bulletPoint ${s.replaceAll('\n', '\n${'    ' * opts.listDepth}  ')}')
     113           2 :         .join('\n');
     114             :   }
     115             : 
     116           2 :   static String _parseOlContent(_ConvertOpts opts, Element node) {
     117           4 :     opts.listDepth++;
     118           4 :     final entries = _listChildNodes(opts, node, {'li'});
     119           4 :     opts.listDepth--;
     120           4 :     final startStr = node.attributes['start'];
     121           2 :     final start = (startStr is String &&
     122           4 :             RegExp(r'^[0-9]+$', multiLine: false).hasMatch(startStr))
     123           2 :         ? int.parse(startStr)
     124             :         : 1;
     125             : 
     126             :     return entries
     127           4 :         .mapIndexed((index, s) =>
     128          16 :             '${'    ' * opts.listDepth}${start + index}. ${s.replaceAll('\n', '\n${'    ' * opts.listDepth}  ')}')
     129           2 :         .join('\n');
     130             :   }
     131             : 
     132             :   static const _listBulletPoints = <String>['●', '○', '■', '‣'];
     133             : 
     134           2 :   static List<String> _listChildNodes(_ConvertOpts opts, Element node,
     135             :       [Iterable<String>? types]) {
     136           2 :     final replies = <String>[];
     137           4 :     for (final child in node.nodes) {
     138             :       if (types != null &&
     139           2 :           types.isNotEmpty &&
     140           2 :           ((child is Text) ||
     141           2 :               ((child is Element) &&
     142           6 :                   !types.contains(child.localName!.toLowerCase())))) {
     143             :         continue;
     144             :       }
     145           4 :       replies.add(_walkNode(opts, child));
     146             :     }
     147             :     return replies;
     148             :   }
     149             : 
     150             :   static const _blockTags = <String>{
     151             :     'blockquote',
     152             :     'ul',
     153             :     'ol',
     154             :     'h1',
     155             :     'h2',
     156             :     'h3',
     157             :     'h4',
     158             :     'h5',
     159             :     'h6',
     160             :     'pre',
     161             :   };
     162             : 
     163           3 :   static String _walkChildNodes(_ConvertOpts opts, Node node) {
     164             :     var reply = '';
     165             :     var lastTag = '';
     166           6 :     for (final child in node.nodes) {
     167           9 :       final thisTag = child is Element ? child.localName!.toLowerCase() : '';
     168           6 :       if (thisTag == 'p' && lastTag == 'p') {
     169           2 :         reply += '\n\n';
     170           3 :       } else if (_blockTags.contains(thisTag) &&
     171           3 :           reply.isNotEmpty &&
     172           8 :           reply[reply.length - 1] != '\n') {
     173           2 :         reply += '\n';
     174             :       }
     175           6 :       reply += _walkNode(opts, child);
     176           3 :       if (thisTag.isNotEmpty) {
     177             :         lastTag = thisTag;
     178             :       }
     179             :     }
     180             :     return reply;
     181             :   }
     182             : 
     183           3 :   static String _walkNode(_ConvertOpts opts, Node node) {
     184           3 :     if (node is Text) {
     185             :       // ignore \n between single nodes
     186           9 :       return node.text == '\n' ? '' : node.text;
     187           3 :     } else if (node is Element) {
     188           6 :       final tag = node.localName!.toLowerCase();
     189             :       switch (tag) {
     190           3 :         case 'em':
     191           3 :         case 'i':
     192           6 :           return '*${_walkChildNodes(opts, node)}*';
     193           3 :         case 'strong':
     194           3 :         case 'b':
     195           6 :           return '**${_walkChildNodes(opts, node)}**';
     196           3 :         case 'u':
     197           3 :         case 'ins':
     198           4 :           return '__${_walkChildNodes(opts, node)}__';
     199           3 :         case 'del':
     200           3 :         case 'strike':
     201           3 :         case 's':
     202           4 :           return '~~${_walkChildNodes(opts, node)}~~';
     203           3 :         case 'code':
     204           4 :           return '`${node.text}`';
     205           3 :         case 'pre':
     206           4 :           return '```${_parsePreContent(opts, node)}```\n';
     207           3 :         case 'a':
     208           6 :           final href = node.attributes['href'] ?? '';
     209           3 :           final content = _walkChildNodes(opts, node);
     210           6 :           if (href.toLowerCase().startsWith('https://matrix.to/#/') ||
     211           6 :               href.toLowerCase().startsWith('matrix:')) {
     212             :             return content;
     213             :           }
     214           3 :           return '🔗$content';
     215           3 :         case 'img':
     216           4 :           return node.attributes['alt'] ??
     217           0 :               node.attributes['title'] ??
     218           0 :               node.attributes['src'] ??
     219             :               '';
     220           3 :         case 'br':
     221             :           return '\n';
     222           3 :         case 'blockquote':
     223           2 :           return _parseBlockquoteContent(opts, node);
     224           3 :         case 'ul':
     225           2 :           return _parseUlContent(opts, node);
     226           3 :         case 'ol':
     227           2 :           return _parseOlContent(opts, node);
     228           3 :         case 'mx-reply':
     229             :           return '';
     230           3 :         case 'hr':
     231             :           return '\n----------\n';
     232           3 :         case 'h1':
     233           3 :         case 'h2':
     234           3 :         case 'h3':
     235           3 :         case 'h4':
     236           3 :         case 'h5':
     237           3 :         case 'h6':
     238           9 :           final mark = '#' * int.parse(tag[1]);
     239           6 :           return '$mark ${_walkChildNodes(opts, node)}\n';
     240           3 :         case 'span':
     241           2 :           return _parseSpanContent(opts, node);
     242             :         default:
     243           3 :           return _walkChildNodes(opts, node);
     244             :       }
     245             :     } else {
     246           3 :       return _walkChildNodes(opts, node);
     247             :     }
     248             :   }
     249             : }
     250             : 
     251             : class _ConvertOpts {
     252             :   int listDepth = 0;
     253             : }

Generated by: LCOV version 1.14