Skip to content

Commit 8e1a5b8

Browse files
author
Sandra Tatarevićová
committed
Fix source positions for inlines
Applied patch from commonmark/cmark#298
1 parent beecac3 commit 8e1a5b8

File tree

6 files changed

+157
-10
lines changed

6 files changed

+157
-10
lines changed

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
[submodule "ext/commonmarker/cmark-upstream"]
22
path = ext/commonmarker/cmark-upstream
3-
url = https://github.com/github/cmark-gfm.git
3+
url = https://github.com/orchitech/cmark-gfm.git
44
ignore = dirty

ext/commonmarker/blocks.c

Lines changed: 70 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -207,8 +207,37 @@ static void add_line(cmark_node *node, cmark_chunk *ch, cmark_parser *parser) {
207207
cmark_strbuf_putc(&node->content, ' ');
208208
}
209209
}
210+
211+
// If inserting the initial line to the node...
212+
if (node->content.size == 0
213+
// OR the node is a code block...
214+
|| node->type == CMARK_NODE_CODE_BLOCK
215+
// OR the node is a HTML block.
216+
|| node->type == CMARK_NODE_HTML_BLOCK) {
217+
218+
// Then do not insert the leading trivia.
210219
cmark_strbuf_put(&node->content, ch->data + parser->offset,
211220
ch->len - parser->offset);
221+
} else {
222+
// Special case for maintaining the source position of block quotes
223+
// as they can be lazy (i.e. the block quote marker can be omitted).
224+
//
225+
// The simple solution is to replace any block quote markers (">")
226+
// present in the leading trivia with whitespace.
227+
//
228+
// Note: Using `parser->offset` and not `parser->first_nonspace`
229+
// because the latter encompasses the former with the addition of
230+
// whitespace (which we are not interested in).
231+
assert(parser->offset <= parser->first_nonspace);
232+
for (int i = 0; i < parser->offset; i++) {
233+
if (peek_at(ch, i) == '>')
234+
ch->data[i] = ' ';
235+
}
236+
237+
// Otherwise, do not remove leading trivia for appends (i.e. lines
238+
// other than the first).
239+
cmark_strbuf_put(&node->content, ch->data, ch->len);
240+
}
212241
}
213242

214243
static void remove_trailing_blank_lines(cmark_strbuf *ln) {
@@ -266,6 +295,12 @@ static bool resolve_reference_link_definitions(
266295

267296
chunk.data += pos;
268297
chunk.len -= pos;
298+
299+
// Leading whitespace has not been stripped at this point, so skip past it here.
300+
while (cmark_isspace(peek_at(&chunk, 0))) {
301+
chunk.data += 1;
302+
chunk.len -= 1;
303+
}
269304
}
270305
cmark_strbuf_drop(node_content, (node_content->size - chunk.len));
271306
return !is_blank(&b->content, 0);
@@ -283,13 +318,33 @@ static cmark_node *finalize(cmark_parser *parser, cmark_node *b) {
283318
CMARK_NODE__OPEN); // shouldn't call finalize on closed blocks
284319
b->flags &= ~CMARK_NODE__OPEN;
285320

286-
if (parser->curline.size == 0) {
321+
if (S_type(b) == CMARK_NODE_THEMATIC_BREAK) {
322+
// Already been "finalized".
323+
return parent;
324+
}
325+
326+
if (S_type(b) == CMARK_NODE_HEADING && !b->as.heading.setext) {
327+
parser->last_line_length += b->end_column;
328+
}
329+
330+
if ((S_type(b) == CMARK_NODE_ITEM || S_type(b) == CMARK_NODE_LIST)
331+
&& b->last_child) {
332+
b->end_line = b->last_child->end_line;
333+
b->end_column = b->last_child->end_column;
334+
335+
if (S_type(b) == CMARK_NODE_ITEM && b->parent) {
336+
// The finalization order is not deterministic...
337+
b->parent->end_line = b->end_line;
338+
b->parent->end_column = b->end_column;
339+
}
340+
} else if (parser->curline.size == 0) {
287341
// end of input - line number has not been incremented
288342
b->end_line = parser->line_number;
289343
b->end_column = parser->last_line_length;
290344
} else if (S_type(b) == CMARK_NODE_DOCUMENT ||
291345
(S_type(b) == CMARK_NODE_CODE_BLOCK && b->as.code.fenced) ||
292-
(S_type(b) == CMARK_NODE_HEADING && b->as.heading.setext)) {
346+
(S_type(b) == CMARK_NODE_HTML_BLOCK
347+
&& b->end_line == b->start_line && b->end_column == 0)) {
293348
b->end_line = parser->line_number;
294349
b->end_column = parser->curline.size;
295350
if (b->end_column && parser->curline.ptr[b->end_column - 1] == '\n')
@@ -1181,6 +1236,10 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container,
11811236
// it's only now that we know the line is not part of a setext heading:
11821237
*container = add_child(parser, *container, CMARK_NODE_THEMATIC_BREAK,
11831238
parser->first_nonspace + 1);
1239+
// A thematic break can only be on a single line, so we can set the
1240+
// end source position here.
1241+
(*container)->end_line = parser->line_number;
1242+
(*container)->end_column = input->len - 1;
11841243
S_advance_offset(parser, input, input->len - 1 - parser->offset, false);
11851244
} else if (!indented &&
11861245
parser->options & CMARK_OPT_FOOTNOTES &&
@@ -1342,6 +1401,12 @@ static void add_text_to_container(cmark_parser *parser, cmark_node *container,
13421401
} else { // not a lazy continuation
13431402
// Finalize any blocks that were not matched and set cur to container:
13441403
while (parser->current != last_matched_container) {
1404+
if (S_type(parser->current) == CMARK_NODE_HTML_BLOCK) {
1405+
// Edge case: Closing an HTML block without a matching end condition.
1406+
parser->current->end_line = parser->line_number - 1;
1407+
parser->current->end_column = parser->last_line_length;
1408+
}
1409+
13451410
parser->current = finalize(parser, parser->current);
13461411
assert(parser->current != NULL);
13471412
}
@@ -1392,7 +1457,10 @@ static void add_text_to_container(cmark_parser *parser, cmark_node *container,
13921457
} else if (accepts_lines(S_type(container))) {
13931458
if (S_type(container) == CMARK_NODE_HEADING &&
13941459
container->as.heading.setext == false) {
1460+
bufsize_t original_len = input->len;
13951461
chop_trailing_hashtags(input);
1462+
// Subtract one to exclude the trailing newline.
1463+
container->end_column += original_len - input->len - 1;
13961464
}
13971465
S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
13981466
false);

ext/commonmarker/buffer.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,11 @@ void cmark_strbuf_trim(cmark_strbuf *buf) {
242242
cmark_strbuf_rtrim(buf);
243243
}
244244

245+
// Removes `len` bytes starting at `start_offset` from `buf`, shifting the
// remaining tail of the buffer left to close the gap.
//
// A range that runs past the end of the buffer is clamped to the end. A
// start offset outside the buffer, or a non-positive length, leaves the
// buffer untouched.
void cmark_strbuf_remove(cmark_strbuf *buf, bufsize_t start_offset, bufsize_t len) {
  if (start_offset < 0 || len <= 0 || start_offset >= buf->size) {
    return;
  }

  // Clamp the range so the tail length computed below can never go
  // negative (a negative count passed to memmove would be converted to a
  // huge size_t).
  if (len > buf->size - start_offset) {
    len = buf->size - start_offset;
  }

  bufsize_t tail_len = buf->size - (start_offset + len);
  if (tail_len > 0) {
    memmove(buf->ptr + start_offset, buf->ptr + start_offset + len, tail_len);
  }
  buf->size -= len;

  // Re-terminate the shrunken content: callers such as S_normalize_code
  // peek one byte past the current position and must not see stale data.
  // The index is always inside the previously used region because at least
  // one byte was removed.
  buf->ptr[buf->size] = '\0';
}
249+
245250
// Destructively modify string, collapsing consecutive
246251
// space and newline characters into a single space.
247252
void cmark_strbuf_normalize_whitespace(cmark_strbuf *s) {

ext/commonmarker/buffer.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,16 @@ void cmark_strbuf_rtrim(cmark_strbuf *buf);
103103
CMARK_GFM_EXPORT
104104
void cmark_strbuf_trim(cmark_strbuf *buf);
105105

106+
/**
107+
Removes the characters in the given range.
108+
109+
@param buf The string buffer.
110+
@param start_offset The starting character offset.
111+
@param len The number of characters to remove.
112+
*/
113+
CMARK_GFM_EXPORT
114+
void cmark_strbuf_remove(cmark_strbuf *buf, bufsize_t start_offset, bufsize_t len);
115+
106116
CMARK_GFM_EXPORT
107117
void cmark_strbuf_normalize_whitespace(cmark_strbuf *s);
108118

ext/commonmarker/inlines.c

Lines changed: 70 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -150,8 +150,8 @@ static CMARK_INLINE cmark_node *make_autolink(subject *subj,
150150
link->as.link.url = cmark_clean_autolink(subj->mem, &url, is_email);
151151
link->as.link.title = cmark_chunk_literal("");
152152
link->start_line = link->end_line = subj->line;
153-
link->start_column = start_column + 1;
154-
link->end_column = end_column + 1;
153+
link->start_column = subj->column_offset + subj->block_offset + start_column + 1;
154+
link->end_column = subj->column_offset + subj->block_offset + end_column + 1;
155155
cmark_node_append_child(link, make_str_with_entities(subj, start_column + 1, end_column - 1, &url));
156156
return link;
157157
}
@@ -325,10 +325,10 @@ static bufsize_t scan_to_closing_backticks(subject *subj,
325325
// spaces, then removing a single leading + trailing space,
326326
// unless the code span consists entirely of space characters.
327327
static void S_normalize_code(cmark_strbuf *s) {
328-
bufsize_t r, w;
328+
bufsize_t r, w, last_char_after_nl;
329329
bool contains_nonspace = false;
330330

331-
for (r = 0, w = 0; r < s->size; ++r) {
331+
for (r = 0, w = 0, last_char_after_nl = 0; r < s->size; ++r) {
332332
switch (s->ptr[r]) {
333333
case '\r':
334334
if (s->ptr[r + 1] != '\n') {
@@ -337,15 +337,46 @@ static void S_normalize_code(cmark_strbuf *s) {
337337
break;
338338
case '\n':
339339
s->ptr[w++] = ' ';
340+
last_char_after_nl = w;
341+
break;
342+
case ' ':
343+
s->ptr[w++] = s->ptr[r];
340344
break;
341345
default:
346+
if (last_char_after_nl) {
347+
// Remove leading whitespace.
348+
bufsize_t remove_len = r - last_char_after_nl;
349+
350+
if (remove_len) {
351+
cmark_strbuf_remove(s, last_char_after_nl, remove_len);
352+
w -= remove_len;
353+
r -= remove_len;
354+
}
355+
356+
last_char_after_nl = 0;
357+
}
358+
342359
s->ptr[w++] = s->ptr[r];
343360
}
344361
if (s->ptr[r] != ' ') {
345362
contains_nonspace = true;
346363
}
347364
}
348365

366+
if (last_char_after_nl) {
367+
// Remove leading whitespace. Only reach here if the closing backquote
368+
// delimiter is on its own line.
369+
bufsize_t remove_len = r - last_char_after_nl;
370+
371+
if (remove_len) {
372+
cmark_strbuf_remove(s, last_char_after_nl, remove_len);
373+
w -= remove_len;
374+
r -= remove_len;
375+
}
376+
377+
last_char_after_nl = 0;
378+
}
379+
349380
// begins and ends with space?
350381
if (contains_nonspace &&
351382
s->ptr[0] == ' ' && s->ptr[w - 1] == ' ') {
@@ -361,13 +392,15 @@ static void S_normalize_code(cmark_strbuf *s) {
361392
// Parse backtick code section or raw backticks, return an inline.
362393
// Assumes that the subject has a backtick at the current position.
363394
static cmark_node *handle_backticks(subject *subj, int options) {
395+
// Save the current source position in case we need to rewind.
396+
bufsize_t subjpos = subj->pos;
364397
cmark_chunk openticks = take_while(subj, isbacktick);
365398
bufsize_t startpos = subj->pos;
366399
bufsize_t endpos = scan_to_closing_backticks(subj, openticks.len);
367400

368401
if (endpos == 0) { // not found
369402
subj->pos = startpos; // rewind
370-
return make_str(subj, subj->pos, subj->pos, openticks);
403+
return make_str(subj, subjpos, subjpos, openticks);
371404
} else {
372405
cmark_strbuf buf = CMARK_BUF_INIT(subj->mem);
373406

@@ -797,6 +830,10 @@ static cmark_node *handle_backslash(cmark_parser *parser, subject *subj) {
797830
advance(subj);
798831
return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_dup(&subj->input, subj->pos - 1, 1));
799832
} else if (!is_eof(subj) && skip_line_end(subj)) {
833+
// Adjust the subject source position state.
834+
++subj->line;
835+
subj->column_offset = -subj->pos;
836+
800837
return make_linebreak(subj->mem);
801838
} else {
802839
return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("\\"));
@@ -1163,7 +1200,8 @@ static cmark_node *handle_close_bracket(cmark_parser *parser, subject *subj) {
11631200
inl = make_simple(subj->mem, is_image ? CMARK_NODE_IMAGE : CMARK_NODE_LINK);
11641201
inl->as.link.url = url;
11651202
inl->as.link.title = title;
1166-
inl->start_line = inl->end_line = subj->line;
1203+
inl->start_line = opener->inl_text->start_line;
1204+
inl->end_line = subj->line;
11671205
inl->start_column = opener->inl_text->start_column;
11681206
inl->end_column = subj->pos + subj->column_offset + subj->block_offset;
11691207
cmark_node_insert_before(opener->inl_text, inl);
@@ -1304,10 +1342,21 @@ static int parse_inline(cmark_parser *parser, subject *subj, cmark_node *parent,
13041342
cmark_chunk contents;
13051343
unsigned char c;
13061344
bufsize_t startpos, endpos;
1345+
int saved_block_offset = subj->block_offset;
1346+
13071347
c = peek_char(subj);
13081348
if (c == 0) {
13091349
return 0;
13101350
}
1351+
1352+
// If NOT the subject's initial line...
1353+
if (subj->column_offset != 0) {
1354+
// Reset the block offset. The line's leading trivia was not trimmed,
1355+
// so the source position will be computed appropriately without the
1356+
// block offset.
1357+
subj->block_offset = 0;
1358+
}
1359+
13111360
switch (c) {
13121361
case '\r':
13131362
case '\n':
@@ -1370,12 +1419,27 @@ static int parse_inline(cmark_parser *parser, subject *subj, cmark_node *parent,
13701419
cmark_chunk_rtrim(&contents);
13711420
}
13721421

1422+
// If not the initial line (in the subject) AND at the beginning of another line.
1423+
if (subj->column_offset != 0 && startpos + subj->column_offset == 0) {
1424+
// Trim leading whitespace.
1425+
bufsize_t before_trim = contents.len;
1426+
cmark_chunk_ltrim(&contents);
1427+
1428+
if (contents.len == 0)
1429+
break; // The contents were only whitespaces.
1430+
1431+
// Update the start source position.
1432+
startpos += before_trim - contents.len;
1433+
}
1434+
13731435
new_inl = make_str(subj, startpos, endpos - 1, contents);
13741436
}
13751437
if (new_inl != NULL) {
13761438
cmark_node_append_child(parent, new_inl);
13771439
}
13781440

1441+
subj->block_offset = saved_block_offset;
1442+
13791443
return 1;
13801444
}
13811445

0 commit comments

Comments
 (0)