Fix source positions for inlines

Sandra Tatarevićová · Sandra Tatarevićová · commit 8e1a5b8dfd84 · 2020-04-08T18:26:02.000+02:00
Applied patch from commonmark/cmark#298
diff --git a/.gitmodules b/.gitmodules
@@ -1,4 +1,4 @@
 [submodule "ext/commonmarker/cmark-upstream"]
 	path = ext/commonmarker/cmark-upstream
-	url = https://github.com/github/cmark-gfm.git
+	url = https://github.com/orchitech/cmark-gfm.git
 	ignore = dirty
diff --git a/ext/commonmarker/blocks.c b/ext/commonmarker/blocks.c
@@ -207,8 +207,37 @@ static void add_line(cmark_node *node, cmark_chunk *ch, cmark_parser *parser) {
       cmark_strbuf_putc(&node->content, ' ');
     }
   }
+
+  // If inserting the initial line to the node...
+  if (node->content.size == 0
+    // OR the node is a code block...
+    || node->type == CMARK_NODE_CODE_BLOCK
+    // OR the node is an HTML block.
+    || node->type == CMARK_NODE_HTML_BLOCK) {
+
+    // Then do not insert the leading trivia.
   cmark_strbuf_put(&node->content, ch->data + parser->offset,
                    ch->len - parser->offset);
+  } else {
+    // Special case for maintaining the source position of block quotes
+    // as they can be lazy (i.e. the block quote marker can be omitted).
+    //
+    // The simple solution is to replace any block quote markers (">")
+    // present in the leading trivia with whitespace.
+    //
+    // Note: Using `parser->offset` and not `parser->first_nonspace`
+    // because the latter encompasses the former with the addition of
+    // whitespace (which we are not interested in).
+    assert(parser->offset <= parser->first_nonspace);
+    for (int i = 0; i < parser->offset; i++) {
+      if (peek_at(ch, i) == '>')
+        ch->data[i] = ' ';
+    }
+
+    // Otherwise, do not remove leading trivia for appends (i.e. lines
+    // other than the first).
+    cmark_strbuf_put(&node->content, ch->data, ch->len);
+  }
 }
 
 static void remove_trailing_blank_lines(cmark_strbuf *ln) {
@@ -266,6 +295,12 @@ static bool resolve_reference_link_definitions(
 
     chunk.data += pos;
     chunk.len -= pos;
+
+    // Leading whitespace has not been stripped yet, so skip over it here.
+    while (cmark_isspace(peek_at(&chunk, 0))) {
+      chunk.data += 1;
+      chunk.len -= 1;
+    }
   }
   cmark_strbuf_drop(node_content, (node_content->size - chunk.len));
   return !is_blank(&b->content, 0);
@@ -283,13 +318,33 @@ static cmark_node *finalize(cmark_parser *parser, cmark_node *b) {
          CMARK_NODE__OPEN); // shouldn't call finalize on closed blocks
   b->flags &= ~CMARK_NODE__OPEN;
 
-  if (parser->curline.size == 0) {
+  if (S_type(b) == CMARK_NODE_THEMATIC_BREAK) {
+    // Has already been "finalized".
+    return parent;
+  }
+
+  if (S_type(b) == CMARK_NODE_HEADING && !b->as.heading.setext) {
+    parser->last_line_length += b->end_column;
+  }
+
+  if ((S_type(b) == CMARK_NODE_ITEM || S_type(b) == CMARK_NODE_LIST)
+      && b->last_child) {
+    b->end_line = b->last_child->end_line;
+    b->end_column = b->last_child->end_column;
+
+    if (S_type(b) == CMARK_NODE_ITEM && b->parent) {
+      // The finalization order is not deterministic...
+      b->parent->end_line = b->end_line;
+      b->parent->end_column = b->end_column;
+    }
+  } else if (parser->curline.size == 0) {
     // end of input - line number has not been incremented
     b->end_line = parser->line_number;
     b->end_column = parser->last_line_length;
   } else if (S_type(b) == CMARK_NODE_DOCUMENT ||
              (S_type(b) == CMARK_NODE_CODE_BLOCK && b->as.code.fenced) ||
-             (S_type(b) == CMARK_NODE_HEADING && b->as.heading.setext)) {
+             (S_type(b) == CMARK_NODE_HTML_BLOCK
+              && b->end_line == b->start_line && b->end_column == 0)) {
     b->end_line = parser->line_number;
     b->end_column = parser->curline.size;
     if (b->end_column && parser->curline.ptr[b->end_column - 1] == '\n')
@@ -1181,6 +1236,10 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container,
       // it's only now that we know the line is not part of a setext heading:
       *container = add_child(parser, *container, CMARK_NODE_THEMATIC_BREAK,
                              parser->first_nonspace + 1);
+      // A thematic break can only be on a single line, so we can set the
+      // end source position here.
+      (*container)->end_line = parser->line_number;
+      (*container)->end_column = input->len - 1;
       S_advance_offset(parser, input, input->len - 1 - parser->offset, false);
     } else if (!indented &&
                parser->options & CMARK_OPT_FOOTNOTES &&
@@ -1342,6 +1401,12 @@ static void add_text_to_container(cmark_parser *parser, cmark_node *container,
   } else { // not a lazy continuation
     // Finalize any blocks that were not matched and set cur to container:
     while (parser->current != last_matched_container) {
+      if (S_type(parser->current) == CMARK_NODE_HTML_BLOCK) {
+        // Edge case: Closing an HTML block without a matching end condition.
+        parser->current->end_line = parser->line_number - 1;
+        parser->current->end_column = parser->last_line_length;
+      }
+
       parser->current = finalize(parser, parser->current);
       assert(parser->current != NULL);
     }
@@ -1392,7 +1457,10 @@ static void add_text_to_container(cmark_parser *parser, cmark_node *container,
     } else if (accepts_lines(S_type(container))) {
       if (S_type(container) == CMARK_NODE_HEADING &&
           container->as.heading.setext == false) {
+        bufsize_t original_len = input->len;
         chop_trailing_hashtags(input);
+        // Subtract one to exclude the trailing newline.
+        container->end_column += original_len - input->len - 1;
       }
       S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
                        false);
diff --git a/ext/commonmarker/buffer.c b/ext/commonmarker/buffer.c
@@ -242,6 +242,25 @@ void cmark_strbuf_trim(cmark_strbuf *buf) {
   cmark_strbuf_rtrim(buf);
 }
 
+// Removes `len` bytes starting at `start_offset` from `buf`, shifting the
+// remaining tail left. A range that is empty or starts outside the buffer
+// is ignored; a range that runs past the end is clamped to the end.
+void cmark_strbuf_remove(cmark_strbuf *buf, bufsize_t start_offset, bufsize_t len) {
+  if (start_offset < 0 || len <= 0 || start_offset >= buf->size)
+    return;
+
+  // Clamp so the tail length handed to memmove can never go negative.
+  if (len > buf->size - start_offset)
+    len = buf->size - start_offset;
+
+  // memmove, not memcpy: source and destination overlap within buf->ptr.
+  memmove(buf->ptr + start_offset, buf->ptr + start_offset + len,
+          buf->size - (start_offset + len));
+  buf->size -= len;
+  // Re-terminate at the new end; this index lies inside the old contents.
+  buf->ptr[buf->size] = '\0';
+}
+
 // Destructively modify string, collapsing consecutive
 // space and newline characters into a single space.
 void cmark_strbuf_normalize_whitespace(cmark_strbuf *s) {
diff --git a/ext/commonmarker/buffer.h b/ext/commonmarker/buffer.h
@@ -103,6 +103,16 @@ void cmark_strbuf_rtrim(cmark_strbuf *buf);
 CMARK_GFM_EXPORT
 void cmark_strbuf_trim(cmark_strbuf *buf);
 
+/**
+ Removes the characters in the given range.
+
+ @param buf The string buffer.
+ @param start_offset The starting character offset.
+ @param len The number of characters to remove.
+ */
+CMARK_GFM_EXPORT
+void cmark_strbuf_remove(cmark_strbuf *buf, bufsize_t start_offset, bufsize_t len);
+
 CMARK_GFM_EXPORT
 void cmark_strbuf_normalize_whitespace(cmark_strbuf *s);
 
diff --git a/ext/commonmarker/cmark-upstream b/ext/commonmarker/cmark-upstream
@@ -1 +1 @@
-Subproject commit 36c1553d2a1f04dc1628e76b18490edeff78b8d0
+Subproject commit bf68304bd726866631923830279c93d47fd29507
diff --git a/ext/commonmarker/inlines.c b/ext/commonmarker/inlines.c
@@ -150,8 +150,8 @@ static CMARK_INLINE cmark_node *make_autolink(subject *subj,
   link->as.link.url = cmark_clean_autolink(subj->mem, &url, is_email);
   link->as.link.title = cmark_chunk_literal("");
   link->start_line = link->end_line = subj->line;
-  link->start_column = start_column + 1;
-  link->end_column = end_column + 1;
+  link->start_column = subj->column_offset + subj->block_offset + start_column + 1;
+  link->end_column = subj->column_offset + subj->block_offset + end_column + 1;
   cmark_node_append_child(link, make_str_with_entities(subj, start_column + 1, end_column - 1, &url));
   return link;
 }
@@ -325,10 +325,10 @@ static bufsize_t scan_to_closing_backticks(subject *subj,
 // spaces, then removing a single leading + trailing space,
 // unless the code span consists entirely of space characters.
 static void S_normalize_code(cmark_strbuf *s) {
-  bufsize_t r, w;
+  bufsize_t r, w, last_char_after_nl;
   bool contains_nonspace = false;
 
-  for (r = 0, w = 0; r < s->size; ++r) {
+  for (r = 0, w = 0, last_char_after_nl = 0; r < s->size; ++r) {
     switch (s->ptr[r]) {
     case '\r':
       if (s->ptr[r + 1] != '\n') {
@@ -337,15 +337,46 @@ static void S_normalize_code(cmark_strbuf *s) {
       break;
     case '\n':
       s->ptr[w++] = ' ';
+      last_char_after_nl = w;
+      break;
+    case ' ':
+      s->ptr[w++] = s->ptr[r];
       break;
     default:
+      if (last_char_after_nl) {
+        // Remove leading whitespace.
+        bufsize_t remove_len = r - last_char_after_nl;
+
+        if (remove_len) {
+          cmark_strbuf_remove(s, last_char_after_nl, remove_len);
+          w -= remove_len;
+          r -= remove_len;
+        }
+
+        last_char_after_nl = 0;
+      }
+
       s->ptr[w++] = s->ptr[r];
     }
     if (s->ptr[r] != ' ') {
       contains_nonspace = true;
     }
   }
 
+  if (last_char_after_nl) {
+    // Remove leading whitespace. This is only reached when the closing
+    // backtick delimiter is on its own line.
+    bufsize_t remove_len = r - last_char_after_nl;
+
+    if (remove_len) {
+      cmark_strbuf_remove(s, last_char_after_nl, remove_len);
+      w -= remove_len;
+      r -= remove_len;
+    }
+
+    last_char_after_nl = 0;
+  }
+
   // begins and ends with space?
   if (contains_nonspace &&
       s->ptr[0] == ' ' && s->ptr[w - 1] == ' ') {
@@ -361,13 +392,15 @@ static void S_normalize_code(cmark_strbuf *s) {
 // Parse backtick code section or raw backticks, return an inline.
 // Assumes that the subject has a backtick at the current position.
 static cmark_node *handle_backticks(subject *subj, int options) {
+  // Save the current source position in case we need to rewind.
+  bufsize_t subjpos = subj->pos;
   cmark_chunk openticks = take_while(subj, isbacktick);
   bufsize_t startpos = subj->pos;
   bufsize_t endpos = scan_to_closing_backticks(subj, openticks.len);
 
   if (endpos == 0) {      // not found
     subj->pos = startpos; // rewind
-    return make_str(subj, subj->pos, subj->pos, openticks);
+    return make_str(subj, subjpos, subjpos, openticks);
   } else {
     cmark_strbuf buf = CMARK_BUF_INIT(subj->mem);
 
@@ -797,6 +830,10 @@ static cmark_node *handle_backslash(cmark_parser *parser, subject *subj) {
     advance(subj);
     return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_dup(&subj->input, subj->pos - 1, 1));
   } else if (!is_eof(subj) && skip_line_end(subj)) {
+    // Adjust the subject source position state.
+    ++subj->line;
+    subj->column_offset = -subj->pos;
+
     return make_linebreak(subj->mem);
   } else {
     return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("\\"));
@@ -1163,7 +1200,8 @@ static cmark_node *handle_close_bracket(cmark_parser *parser, subject *subj) {
   inl = make_simple(subj->mem, is_image ? CMARK_NODE_IMAGE : CMARK_NODE_LINK);
   inl->as.link.url = url;
   inl->as.link.title = title;
-  inl->start_line = inl->end_line = subj->line;
+  inl->start_line = opener->inl_text->start_line;
+  inl->end_line = subj->line;
   inl->start_column = opener->inl_text->start_column;
   inl->end_column = subj->pos + subj->column_offset + subj->block_offset;
   cmark_node_insert_before(opener->inl_text, inl);
@@ -1304,10 +1342,21 @@ static int parse_inline(cmark_parser *parser, subject *subj, cmark_node *parent,
   cmark_chunk contents;
   unsigned char c;
   bufsize_t startpos, endpos;
+  int saved_block_offset = subj->block_offset;
+
   c = peek_char(subj);
   if (c == 0) {
     return 0;
   }
+
+  // If NOT the subject's initial line...
+  if (subj->column_offset != 0) {
+    // Reset the block offset. The line's leading trivia was not trimmed,
+    // so the source position will be computed appropriately without the
+    // block offset.
+    subj->block_offset = 0;
+  }
+
   switch (c) {
   case '\r':
   case '\n':
@@ -1370,12 +1419,27 @@ static int parse_inline(cmark_parser *parser, subject *subj, cmark_node *parent,
       cmark_chunk_rtrim(&contents);
     }
 
+    // If not the initial line (in the subject) AND at the beginning of another line.
+    if (subj->column_offset != 0 && startpos + subj->column_offset == 0) {
+      // Trim leading whitespace.
+      bufsize_t before_trim = contents.len;
+      cmark_chunk_ltrim(&contents);
+
+      if (contents.len == 0)
+        break; // The contents were only whitespaces.
+
+      // Update the start source position.
+      startpos += before_trim - contents.len;
+    }
+
     new_inl = make_str(subj, startpos, endpos - 1, contents);
   }
   if (new_inl != NULL) {
     cmark_node_append_child(parent, new_inl);
   }
 
+  subj->block_offset = saved_block_offset;
+
   return 1;
 }