from fastcore.test import *
helper.latex
We remark that many of the functions in this module are AI generated or assisted.
Validity of latex syntax
Test latex syntax
We require some functions to evaluate whether a latex math mode string is syntactically valid.
assert not _has_invalid_left_right_bracket(r"\left( x \right)")
assert not _has_invalid_left_right_bracket(r"\left[ x \right]")
assert not _has_invalid_left_right_bracket(r"\left\{ x \right\}")
assert not _has_invalid_left_right_bracket(r"\left< x \right>")
assert not _has_invalid_left_right_bracket(r"\left| x \right|")
assert not _has_invalid_left_right_bracket(r"\left\| x \right\|")
assert not _has_invalid_left_right_bracket(r"\left\| x \right\|")
assert _has_invalid_left_right_bracket(r"\lefta x \right)")
assert _has_invalid_left_right_bracket(r"\left( x \righta")
assert _has_invalid_left_right_bracket(r"\left x \right)")
assert _has_invalid_left_right_bracket(r"\left( x \right x")
assert _has_invalid_left_right_bracket(r"\left\backslash x \right/")
assert not _has_invalid_left_right_bracket(r"x + y")
assert _has_invalid_left_right_bracket(r"\left\\")
assert _has_invalid_left_right_bracket(r"\right\\")pattern = regex_pattern_detecting_command(('Sur', 0, None, r'\mathrm{Sur}'))
text = r'The number of element of $\Sur(\operatorname{Cl} \mathcal{O}_L, A)$ is ...'
match = pattern.search(text)
start, end = match.span()
test_eq(text[start:end], r'\Sur')extract_commands_from_nodes
extract_commands_from_nodes (commands:list[str], nodes:list[pylatexenc.latexwalker.LatexNode] )
This is a helper function to extract_latex_commands.
extract_latex_commands
extract_latex_commands (latex_string)
# Example usage
assert extract_latex_commands(r"\frac{a}{b}") == ['frac']
assert extract_latex_commands(r"$\frac{a}{b}$") == ['frac']
assert extract_latex_commands(r"\sqrt[n]{x}") == ['sqrt']
assert extract_latex_commands(r"\binom{n}{k}") == ['binom']
assert extract_latex_commands(r"x^2 + y^2") == [] # No commands, just variables
assert extract_latex_commands(r"\overset{a}{b}") == ['overset']
# Additional tests
assert extract_latex_commands(r"\sum_{i=1}^{n} i") == ['sum']
assert extract_latex_commands(r"\int_{0}^{\infty} e^{-x} dx") == ['int', 'infty']
assert extract_latex_commands(r"\lim_{x \to 0} f(x)") == ['lim', 'to']
assert extract_latex_commands(r"\prod_{i=1}^{n} i") == ['prod']
assert extract_latex_commands(r"\text{Hello} + \frac{1}{2}") == ['text', 'frac']
# Multiple commands in one string
assert extract_latex_commands(r"\frac{a}{b} + \sqrt{c} + \binom{n}{k}") == ['frac', 'sqrt', 'binom']
assert extract_latex_commands(r"\sum_{i=1}^{n} i + \int_{0}^{\infty} e^{-x} dx") == ['sum', 'int', 'infty']
assert extract_latex_commands(r"\lim_{x \to 0} f(x) = \frac{1}{x}") == ['lim', 'to', 'frac']
assert extract_latex_commands(r"\overset{a}{b} + \underset{c}{d}") == ['overset', 'underset']
assert extract_latex_commands(r"\text{This is } \textbf{bold} + \textit{italic} + \frac{1}{2}") == ['text', 'textbf', 'textit', 'frac']
# Complex expressions
test_eq(extract_latex_commands(r"\frac{\sum_{i=1}^{n} i}{n} = \frac{n(n+1)}{2}"), ['frac', 'sum', 'frac'])
test_eq(extract_latex_commands(r"\int_{0}^{1} x^2 \, dx = \frac{1}{3}"), ['int', ',', 'frac'])
assert extract_latex_commands(r"\sqrt{\frac{a}{b}} + \binom{n}{k}") == ['sqrt', 'frac', 'binom']
# Incorrect syntax
assert extract_latex_commands(r"\frac{}}") == ['frac']
assert extract_latex_commands(r"\frac{a}{b}{c}") == ['frac'] # Extra argument
assert extract_latex_commands(r"\frac{a}{b + \frac{c}{d}}") == ['frac', 'frac'] # Nested command
test_eq(extract_latex_commands(r"\sum_{i=1}^{n} i + \int_{0}^{\infty} e^{-x} dx = \frac{1}{2}"), ['sum', 'int', 'infty', 'frac'])
# Comment
assert extract_latex_commands(r"%hi") == []
# Environment Node
test_eq(extract_latex_commands(r"\begin{align} \end{align}"), ['begin', 'end'])
test_eq(extract_latex_commands(r"\begin{align}"), ['begin'])
# test_eq(extract_latex_commands(r"\ begin{align} \end{align}"), [' '])
test_eq(extract_latex_commands(r'\text\in'), ['text', 'in'])detect_incorrect_latex_commands
detect_incorrect_latex_commands (latex_string:str)
*Return True if there is at least one syntactically incorrect use of a latex command detected in latex_string.
This is a helper function to math_mode_string_is_syntactically_valid.*
detect_unbalanced_environments
detect_unbalanced_environments (latex_string:str)
# Example usage
latex_code = r"""
\begin{document}
This is a sample document.
\begin{itemize}
\item First item
\begin{enumerate}
\item First sub-item
\end{enumerate}
\item Second item
\end{itemize}
\end{document}
\begin{wrongenv} % This environment is unmatched
"""
# Detect unbalanced environments
unbalanced = detect_unbalanced_environments(latex_code)
# Print the results
# if unbalanced:
# print("Unbalanced environments detected:")
# for error in unbalanced:
# print(error)
# else:
# print("All environments are balanced.")
assert unbalanced
latex_code = r"""
\begin{document}
This is a sample document.
\begin{itemize}
\item First item
\begin{enumerate}
\item First sub-item
\end{enumerate}
\item Second item
\end{itemize}
\end{document}
"""
# Detect unbalanced environments
unbalanced = detect_unbalanced_environments(latex_code)
assert not unbalancedmath_mode_string_is_syntactically_valid
math_mode_string_is_syntactically_valid (text:str)
*Return True if text is determined to be syntactically valid as a latex str.
There may be TeX syntax rules beyond the scope of this function.
Some caveats:
text is allowed to have dollar signs $ and is also allowed to not have dollar signs. Even if text does not have dollar signs, this function may return True. Even if text has dollar signs, this function may return False if the entire string is not a singular math mode string or if the dollar signs are not used in a math-mode-valid way.*
assert not math_mode_string_is_syntactically_valid(r'$$n=p_1^{e_1} p_2^{e_2} \cdots p_k^$$')
assert not math_mode_string_is_syntactically_valid(r'$x^2 + y^2')
assert not math_mode_string_is_syntactically_valid(r'$$x^2 + y^2$')
assert not math_mode_string_is_syntactically_valid(r'$$x^2 + y^2$ $')
assert math_mode_string_is_syntactically_valid(r'hi')
assert math_mode_string_is_syntactically_valid(r'$hi$')
assert not math_mode_string_is_syntactically_valid(r'$hi$$')
assert math_mode_string_is_syntactically_valid(r'$\\dim ^ a$')
assert not math_mode_string_is_syntactically_valid(r'{ hi')
assert math_mode_string_is_syntactically_valid(r'\{ hi')
assert math_mode_string_is_syntactically_valid(r'\ [')
assert math_mode_string_is_syntactically_valid(r'\left( \right.')
assert not math_mode_string_is_syntactically_valid(r'\left \right.')
assert math_mode_string_is_syntactically_valid(r'$$\left|\sum_{i=0} \right|$$')
assert math_mode_string_is_syntactically_valid(r'$\\\$$')
assert not math_mode_string_is_syntactically_valid(r'\begin{enumerate}')
assert math_mode_string_is_syntactically_valid(r'\begin{enumerate} asdf \end{enumerate}')
assert not math_mode_string_is_syntactically_valid(r'$$R=\sum_P\in X\operatorname length\left(\Omega__X / Y\right)_p\cdot P$$')
# TODO there is something to be considered here; the below
# example would be a syntax error, and yet the functions implemented
# above don't really detect as such.
# assert not detect_incorrect_latex_commands(r'\sideset{_1^2}{_3^4}')
math_mode_string_is_syntactically_valid(r'\text\in')False
The math_mode_string_is_syntactically_valid function experimentally assesses whether a given math mode LaTeX string is syntactically valid. In principle, this should mean that a LaTeX syntax error caused by the string should be detected by the function.
TODO: also consider the following cases:
Unescaped % sign (starts a comment): $x = 50% of y$
Using \! (negative space) at the beginning of math mode: $\!x + y$
The following lists some example outputs of the math_mode_string_is_syntactically_valid function along with explanations.
Unmatched curly braces are a common syntactical error:
assert not math_mode_string_is_syntactically_valid(r'\sqrt{x}}')
However, using \{ or \} does not count towards curly bracket matching:
assert math_mode_string_is_syntactically_valid(r'\{hi')
On the other hand, a backslash \ followed by spaces and then followed by a curly bracket is in itself invalid syntax:
assert not math_mode_string_is_syntactically_valid(r'\ {hi')
math_mode_string_is_syntactically_valid will consider the validity of a string whether or not the string has math mode delimiters.
assert math_mode_string_is_syntactically_valid(r'\operatorname{Gal}')
assert math_mode_string_is_syntactically_valid(r'$\operatorname{Gal}$')
However, math_mode_string_is_syntactically_valid returns False if the string has dollar sign delimiters and more than one math mode string is detected in the string (use latex_indices to separate out math mode strings):
# More than one math mode string is present
assert not math_mode_string_is_syntactically_valid('$hi$ $bye$')
# the math mode delimiter `$` is unbalanced.
assert not math_mode_string_is_syntactically_valid(r'$x^2 + y^2')
# the math mode delimiters `$$` and `$` are unbalanced.
assert not math_mode_string_is_syntactically_valid(r'$$x^2 + y^2$')Tweak a latex string
Sometimes, when autogenerating a latex string through an ML model, some minor formatting eyesores occur, such as a curly bracket { or an underscore _ followed by an unnecessary space. We provide some functions to fix such formatting.
reduce_unnecessary_spaces
reduce_unnecessary_spaces (text:str)
Return a string modifying text by removing spaces which are unnecessary for the purposes of considering the string as a LaTeX string.
# It might not be necessary or desirable to eliminate the space before the backslash `\``
test_eq(reduce_unnecessary_spaces(r'something something \ operatorname'), r'something something \operatorname')
test_eq(reduce_unnecessary_spaces(r'\operatorname{Res} ^ G_ H (R)'), r'\operatorname{Res}^G_H(R)')
test_eq(reduce_unnecessary_spaces(r'\operatorname{Res}^{ G}_{ H } (R)'), r'\operatorname{Res}^{G}_{H}(R)')
test_eq(reduce_unnecessary_spaces(r'M_{ f}'), r'M_{f}')
test_eq(reduce_unnecessary_spaces(r'h_{ p}'), r'h_{p}')
test_eq(reduce_unnecessary_spaces(r'\zeta (s)'), r'\zeta(s)')
test_eq(reduce_unnecessary_spaces(r'\mathcal{ H} _{ v}'), r'\mathcal{H}_{v}')Make fixes to summary
fix_autogen_formatting
fix_autogen_formatting (text:str)
Fix some latex formatting issues in an autogenerated text
Currently, the model is inclined to decode and format its summarizations in such a way that creates formatting issues either for LaTeX or Obsidian.md. For example, the model would output a str containing
- `\ <command_name>` instead of `\<command_name>`
- `{ ` (a curly bracket followed by a space) when `{` is preferable
- `$ <latex_string> $` when `$<latex_string>$` is needed for Obsidian.md.
The fix_autogen_formatting function attempts to get around some of these issues.
text = r'\ to'
sample_output = fix_autogen_formatting(text)
assert r'\to' in sample_output
text = r'$d\ in\ mathbb{ Z}_{\ geq 0} $'
sample_output = fix_autogen_formatting(text)
assert r'\in' in sample_output
assert r'\mathbb{Z}' in sample_output
assert r'\geq 0' in sample_outputtext = r'There are some extra spaces in this math mode string: $ 5 + 7 = 12 $.'
sample_output = fix_autogen_formatting(text)
print(sample_output)
assert r'$5' in sample_output
assert r'12$' in sample_outputThere are some extra spaces in this math mode string: $5 + 7 = 12$ .
text= r'the group of $G$-coinvariants of $A$. It is defined as $$A_{G} :=A / I_\G} A$$'
sample_output = fix_autogen_formatting(text)
print(sample_output)the group of $G$ -coinvariants of $A$ . It is defined as
$$A_{G} :=A / I_\G} A$$
Correct syntax errors in autogenerated math mode strings
correct_latex_syntax_error
correct_latex_syntax_error (summary:str, replacement_candidates:list[str], syntax_validation:Callable[[str],bool]=<function math_mode_string_is_syntactically_valid>)
*Attempt to return a modified version of summary in which the latex math mode strings within summary that are syntactically incorrect are replaced with the most closely resembling element of replacement_candidates.
TODO: consider the possibility that not all math mode str delimiters are formatted correctly.*
| | Type | Default | Details |
|---|---|---|---|
| summary | str | | The autogenerated summary |
| replacement_candidates | list | | A list of candidates to replace. This is expected to be an output of _list_of_candidates_from_math_mode_strings |
| syntax_validation | Callable | math_mode_string_is_syntactically_valid | A test to tell whether a math mode string is syntactically valid. |
| Returns | str | | |
sample_summary = r'the group of $G$-coinvariants of $A$. It is defined as $$A_{G} :=A / I_\G} A$$'
replacement_candidates = [
'A',
'A_',
'A_{G}',
'A_{G}:=A',
'A_{G}:=A',
'A_{G}:=A /',
'A_{G}:=A / I_{G}',
'A_{G}:=A / I_{G} A',
'H_{0}(G, A)',
'H_{0}(G, A) \\simeq',
'H_{0}(G, A) \\simeq A',
'H_{0}(G, A) \\simeq A_',
'H_{0}(G, A) \\simeq A_{G}',
]
test_eq(correct_latex_syntax_error(sample_summary, replacement_candidates), r'the group of $G$-coinvariants of $A$. It is defined as $$A_{G}:=A / I_{G} A$$')
# replacement_candidates