from fastcore.test import *
helper.latex
We remark that many of the functions in this module are AI generated or assisted.
Validity of latex syntax
Test latex syntax
We require some functions to evaluate whether a latex math mode string is syntactically valid.
assert not _has_invalid_left_right_bracket(r"\left( x \right)")
assert not _has_invalid_left_right_bracket(r"\left[ x \right]")
assert not _has_invalid_left_right_bracket(r"\left\{ x \right\}")
assert not _has_invalid_left_right_bracket(r"\left< x \right>")
assert not _has_invalid_left_right_bracket(r"\left| x \right|")
assert not _has_invalid_left_right_bracket(r"\left\| x \right\|")
assert not _has_invalid_left_right_bracket(r"\left\| x \right\|")
assert _has_invalid_left_right_bracket(r"\lefta x \right)")
assert _has_invalid_left_right_bracket(r"\left( x \righta")
assert _has_invalid_left_right_bracket(r"\left x \right)")
assert _has_invalid_left_right_bracket(r"\left( x \right x")
assert _has_invalid_left_right_bracket(r"\left\backslash x \right/")
assert not _has_invalid_left_right_bracket(r"x + y")
assert _has_invalid_left_right_bracket(r"\left\\")
assert _has_invalid_left_right_bracket(r"\right\\")
pattern = regex_pattern_detecting_command(('Sur', 0, None, r'\mathrm{Sur}'))
text = r'The number of element of $\Sur(\operatorname{Cl} \mathcal{O}_L, A)$ is ...'
match = pattern.search(text)
start, end = match.span()
test_eq(text[start:end], r'\Sur')
extract_commands_from_nodes
extract_commands_from_nodes (commands:list[str], nodes:list[pylatexenc.latexwalker.LatexNode] )
This is a helper function to `extract_latex_commands`.
extract_latex_commands
extract_latex_commands (latex_string)
# Example usage
assert extract_latex_commands(r"\frac{a}{b}") == ['frac']
assert extract_latex_commands(r"$\frac{a}{b}$") == ['frac']
assert extract_latex_commands(r"\sqrt[n]{x}") == ['sqrt']
assert extract_latex_commands(r"\binom{n}{k}") == ['binom']
assert extract_latex_commands(r"x^2 + y^2") == [] # No commands, just variables
assert extract_latex_commands(r"\overset{a}{b}") == ['overset']
# Additional tests
assert extract_latex_commands(r"\sum_{i=1}^{n} i") == ['sum']
assert extract_latex_commands(r"\int_{0}^{\infty} e^{-x} dx") == ['int', 'infty']
assert extract_latex_commands(r"\lim_{x \to 0} f(x)") == ['lim', 'to']
assert extract_latex_commands(r"\prod_{i=1}^{n} i") == ['prod']
assert extract_latex_commands(r"\text{Hello} + \frac{1}{2}") == ['text', 'frac']
# Multiple commands in one string
assert extract_latex_commands(r"\frac{a}{b} + \sqrt{c} + \binom{n}{k}") == ['frac', 'sqrt', 'binom']
assert extract_latex_commands(r"\sum_{i=1}^{n} i + \int_{0}^{\infty} e^{-x} dx") == ['sum', 'int', 'infty']
assert extract_latex_commands(r"\lim_{x \to 0} f(x) = \frac{1}{x}") == ['lim', 'to', 'frac']
assert extract_latex_commands(r"\overset{a}{b} + \underset{c}{d}") == ['overset', 'underset']
assert extract_latex_commands(r"\text{This is } \textbf{bold} + \textit{italic} + \frac{1}{2}") == ['text', 'textbf', 'textit', 'frac']
# Complex expressions
test_eq(extract_latex_commands(r"\frac{\sum_{i=1}^{n} i}{n} = \frac{n(n+1)}{2}"), ['frac', 'sum', 'frac'])
test_eq(extract_latex_commands(r"\int_{0}^{1} x^2 \, dx = \frac{1}{3}"), ['int', ',', 'frac'])
assert extract_latex_commands(r"\sqrt{\frac{a}{b}} + \binom{n}{k}") == ['sqrt', 'frac', 'binom']
# Incorrect syntax
assert extract_latex_commands(r"\frac{}}") == ['frac']
assert extract_latex_commands(r"\frac{a}{b}{c}") == ['frac'] # Extra argument
assert extract_latex_commands(r"\frac{a}{b + \frac{c}{d}}") == ['frac', 'frac'] # Nested command
test_eq(extract_latex_commands(r"\sum_{i=1}^{n} i + \int_{0}^{\infty} e^{-x} dx = \frac{1}{2}"), ['sum', 'int', 'infty', 'frac'])
# Comment
assert extract_latex_commands(r"%hi") == []
# Environment Node
test_eq(extract_latex_commands(r"\begin{align} \end{align}"), ['begin', 'end'])
test_eq(extract_latex_commands(r"\begin{align}"), ['begin'])
# test_eq(extract_latex_commands(r"\ begin{align} \end{align}"), [' '])
test_eq(extract_latex_commands(r'\text\in'), ['text', 'in'])
detect_incorrect_latex_commands
detect_incorrect_latex_commands (latex_string:str)
*Return `True` if there is at least one syntactically incorrect use of a latex command detected in `latex_string`.
This is a helper function to `math_mode_string_is_syntactically_valid`.*
detect_unbalanced_environments
detect_unbalanced_environments (latex_string:str)
# Example usage
latex_code = r"""
\begin{document}
This is a sample document.
\begin{itemize}
\item First item
\begin{enumerate}
\item First sub-item
\end{enumerate}
\item Second item
\end{itemize}
\end{document}
\begin{wrongenv} % This environment is unmatched
"""
# Detect unbalanced environments
unbalanced = detect_unbalanced_environments(latex_code)
# Print the results
# if unbalanced:
# print("Unbalanced environments detected:")
# for error in unbalanced:
# print(error)
# else:
# print("All environments are balanced.")
assert unbalanced
latex_code = r"""
\begin{document}
This is a sample document.
\begin{itemize}
\item First item
\begin{enumerate}
\item First sub-item
\end{enumerate}
\item Second item
\end{itemize}
\end{document}
"""
# Detect unbalanced environments
unbalanced = detect_unbalanced_environments(latex_code)
assert not unbalanced
math_mode_string_is_syntactically_valid
math_mode_string_is_syntactically_valid (text:str)
*Return `True` if `text` is determined to be syntactically valid as a latex str.

There may be TeX syntax rules beyond the scope of this function.

Some caveats:

- `text` is allowed to have dollar signs `$` and is also allowed to not have dollar signs. Even if `text` does not have dollar signs, this function may return `True`. Even if `text` has dollar signs, this function may return `False` if the entire string is not a singular math mode string or if the dollar signs are not used in a math-mode-valid way.*
assert not math_mode_string_is_syntactically_valid(r'$$n=p_1^{e_1} p_2^{e_2} \cdots p_k^$$')
assert not math_mode_string_is_syntactically_valid(r'$x^2 + y^2')
assert not math_mode_string_is_syntactically_valid(r'$$x^2 + y^2$')
assert not math_mode_string_is_syntactically_valid(r'$$x^2 + y^2$ $')
assert math_mode_string_is_syntactically_valid(r'hi')
assert math_mode_string_is_syntactically_valid(r'$hi$')
assert not math_mode_string_is_syntactically_valid(r'$hi$$')
assert math_mode_string_is_syntactically_valid(r'$\\dim ^ a$')
assert not math_mode_string_is_syntactically_valid(r'{ hi')
assert math_mode_string_is_syntactically_valid(r'\{ hi')
assert math_mode_string_is_syntactically_valid(r'\ [')
assert math_mode_string_is_syntactically_valid(r'\left( \right.')
assert not math_mode_string_is_syntactically_valid(r'\left \right.')
assert math_mode_string_is_syntactically_valid(r'$$\left|\sum_{i=0} \right|$$')
assert math_mode_string_is_syntactically_valid(r'$\\\$$')
assert not math_mode_string_is_syntactically_valid(r'\begin{enumerate}')
assert math_mode_string_is_syntactically_valid(r'\begin{enumerate} asdf \end{enumerate}')
assert not math_mode_string_is_syntactically_valid(r'$$R=\sum_P\in X\operatorname length\left(\Omega__X / Y\right)_p\cdot P$$')
# TODO there is something to be considered here; the below
# example would be a syntax error, and yet the functions implemented
# above don't really detect as such.
# assert not detect_incorrect_latex_commands(r'\sideset{_1^2}{_3^4}')
math_mode_string_is_syntactically_valid(r'\text\in')
False
The `math_mode_string_is_syntactically_valid` function experimentally assesses whether a given math mode LaTeX string is syntactically valid. In principle, this should mean that any LaTeX syntax error caused by the string is detected by the function.
TODO: consider handling the following cases:
Unescaped % sign (starts a comment): $x = 50% of y$
Using ! (negative space) at the beginning of math mode: $\!x + y$
The following lists some example outputs of the math_mode_string_is_syntactically_valid
function along with explanations.
Unmatched curly braces are a common syntactical error:
assert not math_mode_string_is_syntactically_valid(r'\sqrt{x}}')
However, using \{
or \}
does not count towards curly bracket matching:
assert math_mode_string_is_syntactically_valid(r'\{hi')
On the other hand, a backslash \
followed by spaces and then followed by a curly bracket is in itself an invalid syntax.
assert not math_mode_string_is_syntactically_valid(r'\ {hi')
math_mode_string_is_syntactically_valid
will consider the validity of a string whether or not the string has math mode delimiters.
assert math_mode_string_is_syntactically_valid(r'\operatorname{Gal}')
assert math_mode_string_is_syntactically_valid(r'$\operatorname{Gal}$')
However, `math_mode_string_is_syntactically_valid` returns `False` if the string has dollar sign delimiters and more than one math mode string is detected in the string (use `latex_indices` to separate out math mode strings), or if the math mode delimiters are unbalanced:
# More than one math mode string is present
assert not math_mode_string_is_syntactically_valid('$hi$ $bye$')
# the math mode delimiter `$` is unbalanced.
assert not math_mode_string_is_syntactically_valid(r'$x^2 + y^2')
# the math mode delimiters `$$` and `$` are unbalanced.
assert not math_mode_string_is_syntactically_valid(r'$$x^2 + y^2$')
Tweak a latex string
Sometimes, when autogenerating a latex string through an ML model, some minor formatting eyesores occur, such as a curly bracket `{` or an underscore `_` followed by an unnecessary space. We provide some functions to fix such formatting.
reduce_unnecessary_spaces
reduce_unnecessary_spaces (text:str)
Return a string modifying text
by removing spaces which are unnecessary for the purposes of considering the string as a LaTeX string.
# It might not be necessary or desirable to eliminate the space before the backslash `\`
test_eq(reduce_unnecessary_spaces(r'something something \ operatorname'), r'something something \operatorname')
test_eq(reduce_unnecessary_spaces(r'\operatorname{Res} ^ G_ H (R)'), r'\operatorname{Res}^G_H(R)')
test_eq(reduce_unnecessary_spaces(r'\operatorname{Res}^{ G}_{ H } (R)'), r'\operatorname{Res}^{G}_{H}(R)')
test_eq(reduce_unnecessary_spaces(r'M_{ f}'), r'M_{f}')
test_eq(reduce_unnecessary_spaces(r'h_{ p}'), r'h_{p}')
test_eq(reduce_unnecessary_spaces(r'\zeta (s)'), r'\zeta(s)')
test_eq(reduce_unnecessary_spaces(r'\mathcal{ H} _{ v}'), r'\mathcal{H}_{v}')
Make fixes to summary
fix_autogen_formatting
fix_autogen_formatting (text:str)
Fix some latex formatting issues in an autogenerated text
Currently, the model is inclined to decode and format its summarizations in such a way that creates formatting issues either for LaTeX or `Obsidian.md`. For example, the model would output a str containing

- `\ <command_name>` instead of `\<command_name>`
- `{ ` (a curly bracket followed by a space) when `{` is preferable
- `$ <latex_string> $` when `$<latex_string>$` is needed for `Obsidian.md`.
The `fix_autogen_formatting` function attempts to get around some of these issues.
text = r'\ to'
sample_output = fix_autogen_formatting(text)
assert r'\to' in sample_output
text = r'$d\ in\ mathbb{ Z}_{\ geq 0} $'
sample_output = fix_autogen_formatting(text)
assert r'\in' in sample_output
assert r'\mathbb{Z}' in sample_output
assert r'\geq 0' in sample_output
text = r'There are some extra spaces in this math mode string: $ 5 + 7 = 12 $.'
sample_output = fix_autogen_formatting(text)
print(sample_output)
assert r'$5' in sample_output
assert r'12$' in sample_output
There are some extra spaces in this math mode string: $5 + 7 = 12$ .
text = r'the group of $G$-coinvariants of $A$. It is defined as $$A_{G} :=A / I_\G} A$$'
sample_output = fix_autogen_formatting(text)
print(sample_output)
the group of $G$ -coinvariants of $A$ . It is defined as
$$A_{G} :=A / I_\G} A$$
Correct syntax errors in autogenerated math mode strings
correct_latex_syntax_error
correct_latex_syntax_error (summary:str, replacement_candidates:list[str], syntax_validation:Callable[[str],bool]=<function math_mode_string_is_syntactically_valid>)
*Attempt to replace `summary` with a modified version in which the latex math mode strings within `summary` that are syntactically incorrect are replaced with the most closely resembling element of `replacement_candidates`.

TODO: consider the possibility that not all math mode str delimiters are formatted correctly.*
| | Type | Default | Details |
|---|---|---|---|
| summary | str | | The autogenerated summary |
| replacement_candidates | list | | A list of candidates to replace. This is expected to be an output of `_list_of_candidates_from_math_mode_strings` |
| syntax_validation | Callable | math_mode_string_is_syntactically_valid | A test to tell whether a math mode string is syntactically valid. |
| Returns | str | | |
sample_summary = r'the group of $G$-coinvariants of $A$. It is defined as $$A_{G} :=A / I_\G} A$$'
replacement_candidates = [
    'A',
    'A_',
    'A_{G}',
    'A_{G}:=A',
    'A_{G}:=A',
    'A_{G}:=A /',
    'A_{G}:=A / I_{G}',
    'A_{G}:=A / I_{G} A',
    'H_{0}(G, A)',
    'H_{0}(G, A) \\simeq',
    'H_{0}(G, A) \\simeq A',
    'H_{0}(G, A) \\simeq A_',
    'H_{0}(G, A) \\simeq A_{G}',
]
test_eq(correct_latex_syntax_error(sample_summary, replacement_candidates), r'the group of $G$-coinvariants of $A$. It is defined as $$A_{G}:=A / I_{G} A$$')
# replacement_candidates