diff --git a/.gitignore b/.gitignore index 591d6c5f..7ce35ac8 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ prof-traces test/ds/good.args test/autocomplete_byexample.sh .check-secrets-impl.sh +build/ diff --git a/byexample/example.py b/byexample/example.py index 39494a23..5dc32c80 100644 --- a/byexample/example.py +++ b/byexample/example.py @@ -82,6 +82,7 @@ class Example(object): >>> example.options {'capture': True, + 'ignore_first_empty_lines': True, 'input_prefix_range': (6, 12), 'norm_ws': False, 'rm': [], diff --git a/byexample/expected.py b/byexample/expected.py index 5544d36b..161093ff 100644 --- a/byexample/expected.py +++ b/byexample/expected.py @@ -32,7 +32,7 @@ class _LinearExpected(Expected): >>> from byexample.options import Options >>> from byexample.finder import _build_fake_example as build_example - >>> opts = {'norm_ws': False, 'tags': True, 'capture': True, 'rm': [], 'type': False, 'input_prefix_range': (6,12)} + >>> opts = {'norm_ws': False, 'tags': True, 'capture': True, 'rm': [], 'type': False, 'input_prefix_range': (6,12), 'ignore_first_empty_lines': True} Consider the following example with a named capture in the expected: @@ -145,7 +145,7 @@ class _LinearExpected(Expected): (See byexample.parser docs) - >>> opts = {'norm_ws': True, 'tags': True, 'capture': True, 'rm': [], 'type': False, 'input_prefix_range': (6, 12)} + >>> opts = {'norm_ws': True, 'tags': True, 'capture': True, 'rm': [], 'type': False, 'input_prefix_range': (6, 12), 'ignore_first_empty_lines': True} >>> ex = build_example('f()', '\n A \n\nB C\n', opts=opts) >>> exp = ex.expected diff --git a/byexample/finder.py b/byexample/finder.py index 33b2c417..89469e1e 100644 --- a/byexample/finder.py +++ b/byexample/finder.py @@ -51,7 +51,8 @@ class F: 'capture': True, 'rm': [], 'type': False, - 'input_prefix_range': (6, 12) + 'input_prefix_range': (6, 12), + 'ignore_first_empty_lines': True, } ) parser.extract_options = lambda x: opts diff --git a/byexample/init.py 
b/byexample/init.py index bb5cc13c..e3dd6799 100644 --- a/byexample/init.py +++ b/byexample/init.py @@ -386,6 +386,12 @@ def get_default_options_parser(cmdline_args): options_parser.add_flag( "norm-ws", default=False, help="ignore the amount of whitespaces." ) + options_parser.add_flag( + "ignore-first-empty-lines", + default=True, + help= + "ignore any empty or whitespace-only lines at the begin of the got string." + ) options_parser.add_flag( "pass", default=False, diff --git a/byexample/parser.py b/byexample/parser.py index 36dcba02..b57e16e9 100644 --- a/byexample/parser.py +++ b/byexample/parser.py @@ -193,7 +193,8 @@ def parse(self, example, concerns): input_prefix_len_range = options['input_prefix_range'] expected_regexs, charnos, rcounts, tags_by_idx, input_list = self.expected_as_regexs( example.expected_str, options['tags'], options['capture'], - options['type'], options['norm_ws'], input_prefix_len_range + options['type'], options['norm_ws'], input_prefix_len_range, + options['ignore_first_empty_lines'] ) ExpectedClass = _LinearExpected @@ -230,8 +231,14 @@ def parse(self, example, concerns): @profile def expected_as_regexs( - self, expected, tags_enabled, capture_enabled, input_enabled, - normalize_whitespace, input_prefix_len_range + self, + expected, + tags_enabled, + capture_enabled, + input_enabled, + normalize_whitespace, + input_prefix_len_range, + ignore_first_empty_lines=True ): r''' From the expected string create a list of regular expressions that @@ -262,8 +269,8 @@ def expected_as_regexs( We return the regexs - >>> regexs - ('\\A', 'a', '(?P.*?)', 'b', '(?P.*?)', 'c', '\\n*\\Z') + >>> regexs # byexample: +norm-ws + ('\\A(?:[ \\t]*\\n)*?', 'a', '(?P.*?)', 'b', '(?P.*?)', 'c', '\\n*\\Z') >>> m = re.compile(''.join(regexs), re.MULTILINE | re.DOTALL) >>> m.match('axxbyyyc').groups() @@ -300,7 +307,7 @@ def expected_as_regexs( >>> regexs, _, _, tags_by_idx, _ = _as_regexs(expected, normalize_whitespace=True) >>> regexs # byexample: +norm-ws - 
('\\A', 'a', '(?:.*?)(?.*?)', 'c', '\\s*\\Z') + ('\\A\\s*?', 'a', '(?:.*?)(?.*?)', 'c', '\\s*\\Z') >>> tags_by_idx {2: None, 4: 'foo-bar'} @@ -315,7 +322,7 @@ def expected_as_regexs( >>> regexs, _, _, tags_by_idx, _ = _as_regexs(expected) >>> regexs - ('\\A', 'abc', '\\n*\\Z') + ('\\A(?:[ \\t]*\\n)*?', 'abc', '\\n*\\Z') >>> tags_by_idx {} @@ -324,11 +331,24 @@ def expected_as_regexs( >>> regexs, _, _, tags_by_idx, _ = _as_regexs(expected) >>> regexs - ('\\A', 'a', '(?:.*?)', 'bc', '\\n*\\Z') + ('\\A(?:[ \\t]*\\n)*?', 'a', '(?:.*?)', 'bc', '\\n*\\Z') >>> tags_by_idx {2: None} + When ignore_first_empty_lines is False the begin anchor is a plain \\A, + so the got string must start exactly at the first expected character. + + >>> regexs, _, _, _, _ = _as_regexs('foo', ignore_first_empty_lines=False) + + >>> regexs + ('\\A', 'foo', '\\n*\\Z') + + >>> regexs, _, _, _, _ = _as_regexs('foo', normalize_whitespace=True, ignore_first_empty_lines=False) + + >>> regexs + ('\\A', 'foo', '\\s*\\Z') + ''' if capture_enabled: tag_regexs = self.tag_regexs() @@ -338,12 +358,12 @@ def expected_as_regexs( if normalize_whitespace: sm = SM_NormWS( tag_regexs, self.input_regexs(), self.ellipsis_marker(), - input_prefix_len_range + input_prefix_len_range, ignore_first_empty_lines ) else: sm = SM_NotNormWS( tag_regexs, self.input_regexs(), self.ellipsis_marker(), - input_prefix_len_range + input_prefix_len_range, ignore_first_empty_lines ) return sm.parse(expected, tags_enabled, input_enabled) @@ -433,7 +453,7 @@ def _extend_parser_and_parse_options_strictly_and_cache(self, optlist): >>> regexs, _, _, _, _ = _as_regexs(expected, normalize_whitespace=True) >>> regexs -('\\A', +('\\A\\s*?', 'ex', '\\s', '(?:\\s*(?!\\s)(?:.+)(?>> regexs, _, _, _, _ = _as_regexs(expected, normalize_whitespace=True) >>> regexs -('\\A', +('\\A\\s*?', 'ex', '\\s', '(?:\\s*(?!\\s)(?P.+?)(?>> r, p, c, _, _ = _as_regexs('a \n b \t\vc') >>> r - ('\\A', 'a', '\\s+(?!\\s)', 'b', '\\s+(?!\\s)', 'c', '\\s*\\Z') + 
('\\A\\s*?', 'a', '\\s+(?!\\s)', 'b', '\\s+(?!\\s)', 'c', '\\s*\\Z') >>> match(r, 'a b c') is not None True @@ -845,7 +899,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs - ('\\A', 'a', '(?P.*?)', 'b', '\\s*\\Z') + ('\\A\\s*?', 'a', '(?P.*?)', 'b', '\\s*\\Z') >>> p (0, 0, 1, 6, 7) @@ -861,7 +915,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', 'a', '\\s+(?!\\s)', '(?P.*?)', 'b', '\\s*\\Z') + ('\\A\\s*?', 'a', '\\s+(?!\\s)', '(?P.*?)', 'b', '\\s*\\Z') >>> p (0, 0, 1, 2, 7, 8) @@ -873,7 +927,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', 'a', '(?P.*?)(?.*?)(?>> p (0, 0, 1, 6, 7, 8) @@ -888,7 +942,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs # byexample: +norm-ws -tags - ('\\A', 'a', '\\s', '(?:\\s*(?!\\s)(?P.+?)(?.+?)(?>> p (0, 0, 1, 2, 7, 8, 9) @@ -918,7 +972,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', '(?P.*?)(?.*?)(?>> p (0, 0, 5) @@ -930,7 +984,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', '\\s', '(?:\\s*(?!\\s)(?P.+?)(?.+?)(?>> p (0, 0, 1, 6) @@ -942,7 +996,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', '\\s', '(?:\\s*(?!\\s)(?P.+?)(?.+?)(?>> p (0, 0, 1, 6) @@ -954,7 +1008,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', '(?P.*?)(?.*?)(?>> p (0, 0, 5) @@ -962,11 +1016,37 @@ def parse(self, expected, tags_enabled, input_enabled): >>> 
match(regexs, ' 123 \n\n\n\n').groups() (' 123',) + When the expected starts with whitespace, \A\s*? + \s+(?!\s) would + be quadratic on whitespace-only got strings. They are folded into + the single equivalent \A\s+(?!\s) which is linear (greedy, no + overlap). + + >>> expected = ' foo' + >>> regexs, p, _, _, _ = _as_regexs(expected) + + >>> regexs # byexample: -tags + ('\\A\\s+(?!\\s)', 'foo', '\\s*\\Z') + + >>> p + (0, 2, 5) + + This still skips any extra leading whitespace in the got, just + like \A\s*?\s+(?!\s) would, because \s+ is greedy from \A. + + >>> match(regexs, '\n\n foo').groups() + () + + >>> match(regexs, ' foo').groups() + () + + >>> match(regexs, 'foo') is None + True + >>> expected = ' ' >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', '\\s*\\Z') + ('\\A\\s*?', '\\s*\\Z') >>> p (0, 0) @@ -975,7 +1055,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', '\\s*\\Z') + ('\\A\\s*?', '\\s*\\Z') >>> p (0, 0) @@ -989,7 +1069,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, charnos, rcounts, _, input_list = _as_regexs(expected) >>> regexs # byexample: +norm-ws - ('\\A', 'username', '\\s+(?!\\s)', '\\[john\\]', '\\s+(?!\\s)', + ('\\A\\s*?', 'username', '\\s+(?!\\s)', '\\[john\\]', '\\s+(?!\\s)', 'pass', '\\s+(?!\\s)', '\\[admin\\]', '\\s+(?!\\s)', 'comment', '\\s+(?!\\s)', '\\[', '\\s+(?!\\s)', 'none', '\\s+(?!\\s)', '\\]', '\\s*\\Z') @@ -1010,17 +1090,39 @@ def parse(self, expected, tags_enabled, input_enabled): class SM_NotNormWS(SM): def __init__( - self, tag_regexs, input_regexs, ellipsis_marker, input_prefix_len_range + self, + tag_regexs, + input_regexs, + ellipsis_marker, + input_prefix_len_range, + ignore_first_empty_lines=True ): SM.__init__( self, tag_regexs, input_regexs, ellipsis_marker, - input_prefix_len_range + input_prefix_len_range, ignore_first_empty_lines ) @constant def 
trailing_newlines_regex(self): return re.compile(r'\n*\Z', re.MULTILINE | re.DOTALL) + def _begin_of_string_regex(self): + r''' + If ignore_first_empty_lines is True (the default), skip any leading + empty or whitespace-only lines in the got before matching content. + + A non-greedy *? is used to avoid consuming lines that the expected + regex (e.g. a tag) may need to match itself. This is safe and + non-pathological: each iteration of the group consumes at least one + \n, so the total work is linear in the number of leading blank lines. + + If ignore_first_empty_lines is False, use a plain \A anchor so that + the got string must start exactly where the expected content begins. + ''' + if self.ignore_first_empty_lines: + return r'\A(?:[ \t]*\n)*?' + return r'\A' + def emit_tag(self, ctx, endline): assert ctx in ('n', '0') return SM.emit_tag(self, ctx, endline) @@ -1105,7 +1207,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, charnos, rcounts, tags_by_idx, input_list = _as_regexs(expected) >>> regexs # byexample: -tags +norm-ws - ('\\A', 'a', '(?P.*?)', 'b', '(?P.*?)', 'c', '(?:.*?)', 'd', '\\n*\\Z') + ('\\A(?:[ \\t]*\\n)*?', 'a', '(?P.*?)', 'b', '(?P.*?)', 'c', '(?:.*?)', 'd', '\\n*\\Z') >>> match(regexs, 'axxbyyyczzd').groups() ('xx', 'yyy') @@ -1152,7 +1254,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, _, rcounts, _, _ = _as_regexs(expected) >>> regexs # byexample: +norm-ws -tags - ('\\A', + ('\\A(?:[ \\t]*\\n)*?', 'a', '\\\n', '(?P.*?)', @@ -1200,7 +1302,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, _, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', '(?:(?P.+?)(?.+?)(?>> match(regexs, ' 123 \n\n\n\n').groups() (' 123 ',) @@ -1209,7 +1311,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, _, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', '(?:(?P.+?)(?.+?)(?>> match(regexs, '123\n\n\n\n').groups() ('123',) @@ -1218,7 
+1320,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, _, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', '\\\n', '(?:(?P.+?)(?.+?)(?>> match(regexs, '\n123\n\n\n\n').groups() ('123',) @@ -1235,7 +1337,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, charnos, rcounts, _, input_list = _as_regexs(expected) >>> regexs # byexample: +norm-ws - ('\\A', 'username', '\\ ', '\\[john\\]', '\\\n', + ('\\A(?:[ \\t]*\\n)*?', 'username', '\\ ', '\\[john\\]', '\\\n', 'pass', '\\ ', '\\[admin\\]', '\\ \\ ', '\\\n', 'comment', '\\ ', '\\[', '\\ ', 'none', '\\ ', '\\]', '\\n*\\Z') diff --git a/docs/basic/normalize-whitespace.md b/docs/basic/normalize-whitespace.md index d512dfcb..d888fc8d 100644 --- a/docs/basic/normalize-whitespace.md +++ b/docs/basic/normalize-whitespace.md @@ -1,3 +1,11 @@ + # Normalize Whitespace Replace any sequence of whitespace by a single one. @@ -35,3 +43,77 @@ Here is another example, this time written in ``Ruby``: 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] ``` +## Empty lines at the begin are ignored by default + + +Consider the following `"\n \nSome line"` output. The following three +examples match because by default `byexample` discards any empty line +at the begin of the output. + +```python +>>> someline = "\n \nSome line" + +>>> print(someline) # OK: <...> captures the empty lines +<...> +Some line + +>>> print(someline) # OK too: the same reason above +<...>Some line + +>>> print(someline) # OK: byexample ignores the empty lines "as if" a <...> was there +Some line +``` + +`byexample` understands as "empty lines" lines made entirely of spaces +and tabs ended with a new line. It is subtle but such definition does +not include indentation. 
+ +Consider the following `"\n \n Some indented line"`: + +```python +>>> someindented = "\n \n Some indented line" + +>>> print(someindented) # FAIL: the example is not expecting indentation # byexample: +pass +<...> +Some indented line + +>>> print(someindented) # OK: <...> captures all including the indentation +<...>Some indented line + +>>> print(someindented) # FAIL: byexample ignores the empty lines but not the indentation # byexample: +pass +Some indented line +``` + +When `+norm-ws` is enabled, those two `FAIL` examples will work because +`byexample` relaxes the definition of empty lines and replaces by +"any whitespace" which the indentation gets included: + +```python +>>> print(someindented) # byexample: +norm-ws +<...> +Some indented line + +>>> print(someindented) # byexample: +norm-ws +Some indented line +``` + +> *New* in `byexample 11.0.0`: before `11.0.0` it was up to the user to +> put a <...> or similar to ignore the empty lines at the begin (or use +> `+rm=~` combined with `+norm-ws`). +> Since `11.0.0` this is the default. 
If you want the old behavior you +> can use the flag `-ignore-first-empty-lines` + + diff --git a/docs/contrib/how-to-support-new-finders-and-languages.md b/docs/contrib/how-to-support-new-finders-and-languages.md index ceab692c..ea23a1a0 100644 --- a/docs/contrib/how-to-support-new-finders-and-languages.md +++ b/docs/contrib/how-to-support-new-finders-and-languages.md @@ -281,7 +281,7 @@ the scenes so you do not to be worry about the details): ```python >>> from byexample.options import Options, OptionParser ->>> parser = ArnoldCParser(cfg=Config(verbosity=0, encoding='utf-8', options=Options(rm=[], norm_ws=False, tags=True, capture=True, type=False, input_prefix_range=(6,12), optparser=OptionParser(add_help=False)))) +>>> parser = ArnoldCParser(cfg=Config(verbosity=0, encoding='utf-8', options=Options(rm=[], norm_ws=False, tags=True, capture=True, type=False, input_prefix_range=(6,12), ignore_first_empty_lines=True, optparser=OptionParser(add_help=False)))) >>> from byexample.finder import Example >>> runner = None # not yet diff --git a/test/bad-empty-line.md b/test/bad-empty-line.md new file mode 100644 index 00000000..4eb9f14c --- /dev/null +++ b/test/bad-empty-line.md @@ -0,0 +1,11 @@ +```python +>>> print("\n \n Some line") # should fail (missing indentation) +<...> +Some line + +>>> print("\n \n Some line") # should fail (missing indentation) +Some line + +>>> print("\n \nSome line") # should fail because we are using pre-11.0.0 behaviour # byexample: -ignore-first-empty-lines +Some line +```