perf(treesitter): run injection query only for visible lines

Problem:
Executing injection query on the full source is slow.

Solution:
Execute injection query only on the given range.

Notes:
* This is not applicable to languages with combined injection.
* `is_valid(false)` should run full injection to determine if the
  current set of children parsers and their regions are complete. Since
  this can be slow, `parse()` no longer checks this at the beginning.
* Children parsers and regions outside the given range are discarded.
This commit is contained in:
Jaehwang Jung 2024-03-07 11:15:06 +09:00
parent 0365f5a82c
commit e24ca40217
4 changed files with 99 additions and 51 deletions

View File

@ -1338,7 +1338,8 @@ LanguageTree:invalidate({reload}) *LanguageTree:invalidate()*
LanguageTree:is_valid({exclude_children}) *LanguageTree:is_valid()*
Returns whether this LanguageTree is valid, i.e., |LanguageTree:trees()|
reflects the latest state of the source. If invalid, user should call
|LanguageTree:parse()|.
|LanguageTree:parse()|. `is_valid(false)` can be slow because it runs
injection on the full source.
Parameters: ~
• {exclude_children} (`boolean?`) whether to ignore the validity of

View File

@ -72,8 +72,12 @@ local TSCallbackNames = {
---@field private _callbacks table<TSCallbackName,function[]> Callback handlers
---@field package _callbacks_rec table<TSCallbackName,function[]> Callback handlers (recursive)
---@field private _children table<string,vim.treesitter.LanguageTree> Injected languages
---@field private _injection_query vim.treesitter.Query Queries defining injected languages
---@field private _injection_query vim.treesitter.Query? Queries defining injected languages
---
---If `is_valid(true) and _injections_processed`, the set of children parsers and their sets of
---regions are complete wrt. the full source, so that it's not necessary to execute injections.
---@field private _injections_processed boolean
---
---@field private _opts table Options
---@field private _parser TSParser Parser for language
---@field private _has_regions boolean
@ -271,6 +275,7 @@ end
--- Returns whether this LanguageTree is valid, i.e., |LanguageTree:trees()| reflects the latest
--- state of the source. If invalid, user should call |LanguageTree:parse()|.
--- `is_valid(false)` can be slow because it runs injection on the full source.
---@param exclude_children boolean|nil whether to ignore the validity of children (default `false`)
---@return boolean
function LanguageTree:is_valid(exclude_children)
@ -285,8 +290,11 @@ function LanguageTree:is_valid(exclude_children)
end
if not exclude_children then
-- Run full injection to check if the current set of children and their regions are complete.
-- Note that `set_included_regions` marks new regions invalid.
if not self._injections_processed then
return false
self:_add_injections(true)
self._injections_processed = true
end
for _, child in pairs(self._children) do
@ -384,11 +392,12 @@ function LanguageTree:_parse_regions(range)
end
--- @private
--- @param range boolean|Range|nil
--- @return number
function LanguageTree:_add_injections()
function LanguageTree:_add_injections(range)
local seen_langs = {} ---@type table<string,boolean>
local query_time, injections_by_lang = tcall(self._get_injections, self)
local query_time, injections_by_lang = tcall(self._get_injections, self, range)
for lang, injection_regions in pairs(injections_by_lang) do
local has_lang = pcall(language.add, lang)
@ -437,11 +446,6 @@ end
--- only the root tree without injections).
--- @return table<integer, TSTree>
function LanguageTree:parse(range)
if self:is_valid() then
self:_log('valid')
return self._trees
end
local changes --- @type Range6[]?
-- Collect some stats
@ -458,9 +462,19 @@ function LanguageTree:parse(range)
end
end
if not self._injections_processed and range ~= false and range ~= nil then
query_time = self:_add_injections()
self._injections_processed = true
-- NOTE: Trade-off in partial injection query execution
-- * The good: Each `parse()` is faster.
-- * The bad: `is_valid(false)` is more expensive, requiring a full injection query execution. To
-- avoid this cost, each `parse()` always runs partial injection. However, this is not a big
-- problem as partial injection is very cheap even on huge files.
-- * A potential optimization: Track the ranges where the set of injected regions is known to be
-- complete and valid, and run the injection query only on the intersection of requested ranges
-- and the invalid ranges. This would be even more beneficial for combined injection.
if self._injection_query and not self._injections_processed and range then
query_time = self:_add_injections(range)
if range == true or self._injection_query.has_combined_injection then
self._injections_processed = true
end
end
self:_log({
@ -995,36 +1009,54 @@ end
---
--- This is where most of the injection processing occurs.
---
--- TODO: Allow for an offset predicate to tailor the injection range
--- instead of using the entire nodes range.
--- @param range boolean|Range|nil
--- @private
--- @return table<string, Range6[][]>
function LanguageTree:_get_injections()
if not self._injection_query then
function LanguageTree:_get_injections(range)
if not self._injection_query or not range then
return {}
end
---@type table<integer,vim.treesitter.languagetree.Injection>
local injections = {}
local range_start_line, range_end_line ---@type integer, integer
if range ~= true then
local sline, _, eline, _ = Range.unpack4(range)
range_start_line, range_end_line = sline, eline
end
for index, tree in pairs(self._trees) do
local root_node = tree:root()
local start_line, _, end_line, _ = root_node:range()
local start_line, _, end_line, end_col = root_node:range()
if end_col > 0 then
end_line = end_line + 1
end
for pattern, match, metadata in
self._injection_query:iter_matches(
root_node,
self._source,
start_line,
end_line + 1,
{ all = true }
)
do
local lang, combined, ranges = self:_get_injection(match, metadata)
if lang then
add_injection(injections, index, pattern, lang, combined, ranges)
else
self:_log('match from injection query failed for pattern', pattern)
-- If the query doesn't have combined injection, run the query on the given range. Combined
-- injection must be run on the full range. Currently there is no simple way to selectively
-- match each pattern separately.
if range ~= true and not self._injection_query.has_combined_injection then
start_line = math.max(start_line, range_start_line)
end_line = math.min(end_line, range_end_line)
end
if start_line < end_line then
for pattern, match, metadata in
self._injection_query:iter_matches(
root_node,
self._source,
start_line,
end_line,
{ all = true }
)
do
local lang, combined, ranges = self:_get_injection(match, metadata)
if lang then
add_injection(injections, index, pattern, lang, combined, ranges)
else
self:_log('match from injection query failed for pattern', pattern)
end
end
end
end

View File

@ -11,6 +11,7 @@ local M = {}
---@field lang string name of the language for this parser
---@field captures string[] list of (unique) capture names defined in query
---@field info vim.treesitter.QueryInfo contains information used in the query (e.g. captures, predicates, directives)
---@field has_combined_injection true? whether this query has a combined injection pattern
---@field query TSQuery userdata query object
local Query = {}
Query.__index = Query
@ -30,6 +31,18 @@ function Query.new(lang, ts_query)
patterns = query_info.patterns,
}
self.captures = self.info.captures
for _, preds in pairs(self.info.patterns) do
if
vim.tbl_contains(preds, function(pred)
return vim.deep_equal(pred, { 'set!', 'injection.combined' })
end, { predicate = true })
then
self.has_combined_injection = true
break
end
end
return self
end

View File

@ -853,8 +853,9 @@ print()
]]
)
-- Regions outside the given range are discarded.
eq(
2,
1,
exec_lua [[
parser:parse({2, 6})
return vim.tbl_count(parser:children().lua:trees())
@ -997,19 +998,7 @@ print()
]]
end
it('is valid excluding, invalid including children initially', function()
eq(true, exec_lua('return parser:is_valid(true)'))
eq(false, exec_lua('return parser:is_valid()'))
end)
it('is fully valid after a full parse', function()
exec_lua('parser:parse(true)')
eq(true, exec_lua('return parser:is_valid(true)'))
eq(true, exec_lua('return parser:is_valid()'))
end)
it('is fully valid after a parsing a range on parsed tree', function()
exec_lua('vim.treesitter.get_parser():parse({5, 7})')
it('is valid including children since it does not have one', function()
eq(true, exec_lua('return parser:is_valid(true)'))
eq(true, exec_lua('return parser:is_valid()'))
end)
@ -1082,17 +1071,30 @@ print()
eq(false, exec_lua('return parser:is_valid()'))
end)
it('is valid excluding, invalid including children after a rangeless parse', function()
exec_lua('parser:parse()')
eq(true, exec_lua('return parser:is_valid(true)'))
eq(false, exec_lua('return parser:is_valid()'))
end)
it(
'is fully valid after a rangeless parse, since the only change to the children was removing a region',
function()
exec_lua('parser:parse()')
eq(true, exec_lua('return parser:is_valid(true)'))
eq(true, exec_lua('return parser:is_valid()'))
end
)
it('is fully valid after a range parse that includes injection region', function()
exec_lua('parser:parse({5, 7})')
eq(true, exec_lua('return parser:is_valid(true)'))
eq(true, exec_lua('return parser:is_valid()'))
end)
it(
'is valid excluding, invalid including children after a range parse that does not include injection region',
function()
exec_lua('parser:parse({2, 4})')
eq(vim.NIL, get_regions())
eq(true, exec_lua('return parser:is_valid(true)'))
eq(false, exec_lua('return parser:is_valid()'))
end
)
end)
describe('when editing an injection region', function()