helper.latex

Helper functions for latex functionalities

We remark that many of the functions in this module are AI generated or assisted.

from fastcore.test import *

Validity of latex syntax

Test latex syntax

We require some functions to evaluate whether a latex math mode string is syntactically valid.

assert not _has_invalid_left_right_bracket(r"\left( x \right)")
assert not _has_invalid_left_right_bracket(r"\left[ x \right]")
assert not _has_invalid_left_right_bracket(r"\left\{ x \right\}")
assert not _has_invalid_left_right_bracket(r"\left< x \right>")
assert not _has_invalid_left_right_bracket(r"\left| x \right|")
assert not _has_invalid_left_right_bracket(r"\left\| x \right\|")
assert not _has_invalid_left_right_bracket(r"\left\| x \right\|")
assert _has_invalid_left_right_bracket(r"\lefta x \right)")
assert _has_invalid_left_right_bracket(r"\left( x \righta")
assert _has_invalid_left_right_bracket(r"\left x \right)")
assert _has_invalid_left_right_bracket(r"\left( x \right x")
assert _has_invalid_left_right_bracket(r"\left\backslash x \right/")
assert not _has_invalid_left_right_bracket(r"x + y")
assert _has_invalid_left_right_bracket(r"\left\\")
assert _has_invalid_left_right_bracket(r"\right\\")

pattern = regex_pattern_detecting_command(('Sur', 0, None, r'\mathrm{Sur}'))
text = r'The number of element of $\Sur(\operatorname{Cl} \mathcal{O}_L, A)$ is ...'
match = pattern.search(text)
start, end = match.span()
test_eq(text[start:end], r'\Sur')

extract_commands_from_nodes

 extract_commands_from_nodes (commands:list[str],
                              nodes:list[pylatexenc.latexwalker.LatexNode]
                              )

This is a helper function to extract_latex_commands.

extract_latex_commands

 extract_latex_commands (latex_string)

# Example usage
assert extract_latex_commands(r"\frac{a}{b}") == ['frac']
assert extract_latex_commands(r"$\frac{a}{b}$") == ['frac']
assert extract_latex_commands(r"\sqrt[n]{x}") == ['sqrt']
assert extract_latex_commands(r"\binom{n}{k}") == ['binom']
assert extract_latex_commands(r"x^2 + y^2") == []  # No commands, just variables
assert extract_latex_commands(r"\overset{a}{b}") == ['overset']

# Additional tests
assert extract_latex_commands(r"\sum_{i=1}^{n} i") == ['sum']
assert extract_latex_commands(r"\int_{0}^{\infty} e^{-x} dx") == ['int', 'infty']
assert extract_latex_commands(r"\lim_{x \to 0} f(x)") == ['lim', 'to']
assert extract_latex_commands(r"\prod_{i=1}^{n} i") == ['prod']
assert extract_latex_commands(r"\text{Hello} + \frac{1}{2}") == ['text', 'frac']

# Multiple commands in one string
assert extract_latex_commands(r"\frac{a}{b} + \sqrt{c} + \binom{n}{k}") == ['frac', 'sqrt', 'binom']
assert extract_latex_commands(r"\sum_{i=1}^{n} i + \int_{0}^{\infty} e^{-x} dx") == ['sum', 'int', 'infty']
assert extract_latex_commands(r"\lim_{x \to 0} f(x) = \frac{1}{x}") == ['lim', 'to', 'frac']
assert extract_latex_commands(r"\overset{a}{b} + \underset{c}{d}") == ['overset', 'underset']
assert extract_latex_commands(r"\text{This is } \textbf{bold} + \textit{italic} + \frac{1}{2}") == ['text', 'textbf', 'textit', 'frac']

# Complex expressions
test_eq(extract_latex_commands(r"\frac{\sum_{i=1}^{n} i}{n} = \frac{n(n+1)}{2}"), ['frac', 'sum', 'frac'])
test_eq(extract_latex_commands(r"\int_{0}^{1} x^2 \, dx = \frac{1}{3}"), ['int', ',', 'frac'])
assert extract_latex_commands(r"\sqrt{\frac{a}{b}} + \binom{n}{k}") == ['sqrt', 'frac', 'binom']

# Incorrect synntax
assert extract_latex_commands(r"\frac{}}") == ['frac']
assert extract_latex_commands(r"\frac{a}{b}{c}") == ['frac']  # Extra argument
assert extract_latex_commands(r"\frac{a}{b + \frac{c}{d}}") == ['frac', 'frac']  # Nested command
test_eq(extract_latex_commands(r"\sum_{i=1}^{n} i + \int_{0}^{\infty} e^{-x} dx = \frac{1}{2}"), ['sum', 'int', 'infty', 'frac'])
# Comment
assert extract_latex_commands(r"%hi") == []
# Environment Node
test_eq(extract_latex_commands(r"\begin{align} \end{align}"), ['begin', 'end'])
test_eq(extract_latex_commands(r"\begin{align}"), ['begin'])
# test_eq(extract_latex_commands(r"\ begin{align} \end{align}"), [' '])

test_eq(extract_latex_commands(r'\text\in'), ['text', 'in'])

detect_incorrect_latex_commands

 detect_incorrect_latex_commands (latex_string:str)

*Return True if there is at least one syntactically incorrect use of a latex command detected in latex_string.

This is a helper function to math_mode_string_is_syntactically_valid.*

detect_unbalanced_environments

 detect_unbalanced_environments (latex_string:str)

# Example usage
latex_code = r"""
\begin{document}
This is a sample document.
\begin{itemize}
    \item First item
    \begin{enumerate}
        \item First sub-item
    \end{enumerate}
    \item Second item
\end{itemize}
\end{document}
\begin{wrongenv}  % This environment is unmatched
"""

# Detect unbalanced environments
unbalanced = detect_unbalanced_environments(latex_code)

# Print the results
# if unbalanced:
#     print("Unbalanced environments detected:")
#     for error in unbalanced:
#         print(error)
# else:
#     print("All environments are balanced.")

assert unbalanced


latex_code = r"""
\begin{document}
This is a sample document.
\begin{itemize}
    \item First item
    \begin{enumerate}
        \item First sub-item
    \end{enumerate}
    \item Second item
\end{itemize}
\end{document}
"""
# Detect unbalanced environments
unbalanced = detect_unbalanced_environments(latex_code)
assert not unbalanced

math_mode_string_is_syntactically_valid

 math_mode_string_is_syntactically_valid (text:str)

*Return True if text is determined to be syntactically valid as a latex str.

There may be TeX syntax rules beyond the scope of this function.

Some caveats:

text is allowed to have dollar signs $ and is also allowed to not have dollar signs. Even if text does not have dollar signs, this function may return True. Even if text has dollar signs, this function may return False if the entire string is not a singular math mode string or if the dollar signs are not used in a math-mode-valid way.*

assert not math_mode_string_is_syntactically_valid(r'$$n=p_1^{e_1} p_2^{e_2} \cdots p_k^$$')
assert not math_mode_string_is_syntactically_valid(r'$x^2 + y^2')
assert not math_mode_string_is_syntactically_valid(r'$$x^2 + y^2$')
assert not math_mode_string_is_syntactically_valid(r'$$x^2 + y^2$ $')
assert math_mode_string_is_syntactically_valid(r'hi')
assert math_mode_string_is_syntactically_valid(r'$hi$')
assert not math_mode_string_is_syntactically_valid(r'$hi$$')
assert math_mode_string_is_syntactically_valid(r'$\\dim ^ a$')
assert not math_mode_string_is_syntactically_valid(r'{ hi')
assert math_mode_string_is_syntactically_valid(r'\{ hi')
assert math_mode_string_is_syntactically_valid(r'\ [')
assert math_mode_string_is_syntactically_valid(r'\left( \right.')
assert not math_mode_string_is_syntactically_valid(r'\left \right.')
assert math_mode_string_is_syntactically_valid(r'$$\left|\sum_{i=0} \right|$$')
assert math_mode_string_is_syntactically_valid(r'$\\\$$')
assert not math_mode_string_is_syntactically_valid(r'\begin{enumerate}')
assert math_mode_string_is_syntactically_valid(r'\begin{enumerate} asdf \end{enumerate}')
assert not math_mode_string_is_syntactically_valid(r'$$R=\sum_P\in X\operatorname length\left(\Omega__X / Y\right)_p\cdot P$$')
# TODO there is something to be considered here; the below
# example would be a syntax error, and yet the functions  implemented
# above don't really detect as such.
# assert not detect_incorrect_latex_commands(r'\sideset{_1^2}{_3^4}')

math_mode_string_is_syntactically_valid(r'\text\in')

False

The math_mode_string_is_syntactically_valid experimentally assesses whether a given math mode LaTeX string is syntactically valid. In principal, this should mean that a LaTeX syntax error caused by the string should be detected by the function.

TODO: consider the following to :

Unescaped % sign (starts a comment): $x = 50% of y$

Using ! (negative space) at the beginning of math mode: $\!x + y$

The following lists some example outputs of the math_mode_string_is_syntactically_valid function along with explanations.

Unmatched curly braces are a common syntactical error:

assert not math_mode_string_is_syntactically_valid(r'\sqrt{x}}')

However, using \{ or \} does not count towards curly bracket matching:

assert math_mode_string_is_syntactically_valid(r'\{hi')

On the other hand, a backslash \ followed by spaces and then followed by a curly bracket is in itself an invalid syntax.

assert not math_mode_string_is_syntactically_valid(r'\ {hi')

math_mode_string_is_syntactically_valid will consider the validity of a string whether or not the string has math mode delimiters.

assert math_mode_string_is_syntactically_valid(r'\operatorname{Gal}')
assert math_mode_string_is_syntactically_valid(r'$\operatorname{Gal}$')

However, math_mode_string_is_syntactically_valid returns False if the string has dollar sign delimiters and more than one math mode string is detected in the string (use latex_indices to separate out math mode strings.),

# More than one math mode string is present
assert not math_mode_string_is_syntactically_valid('$hi$ $bye$')
# the math mode delimiter `$` is unbalanced.
assert not math_mode_string_is_syntactically_valid(r'$x^2 + y^2')
# the math mode delimiters `$$` and `$` are unbalanced.
assert not math_mode_string_is_syntactically_valid(r'$$x^2 + y^2$')

Tweak a latex string

Sometimes, when autogenerating a latex string through an ML model, some minor formatting eyesores occur, such as a curly bracket { or an underscore _ followed by an unncessary space. We provide some functions to fix such formatting.

reduce_unnecessary_spaces

 reduce_unnecessary_spaces (text:str)

Return a string modifying text by removing spaces which are unnecessary for the purposes of considering the string as a LaTeX string.

# It might not be necessary or desirable to eliminate the space before the backslash `\``
test_eq(reduce_unnecessary_spaces(r'something something \  operatorname'), r'something something \operatorname')
test_eq(reduce_unnecessary_spaces(r'\operatorname{Res}  ^ G_ H (R)'), r'\operatorname{Res}^G_H(R)')
test_eq(reduce_unnecessary_spaces(r'\operatorname{Res}^{ G}_{ H } (R)'), r'\operatorname{Res}^{G}_{H}(R)')
test_eq(reduce_unnecessary_spaces(r'M_{ f}'), r'M_{f}')
test_eq(reduce_unnecessary_spaces(r'h_{ p}'), r'h_{p}')
test_eq(reduce_unnecessary_spaces(r'\zeta (s)'), r'\zeta(s)')
test_eq(reduce_unnecessary_spaces(r'\mathcal{ H} _{ v}'), r'\mathcal{H}_{v}')

Make fixes to summary

fix_autogen_formatting

 fix_autogen_formatting (text:str)

Fix some latex formatting issues in an autogenerated text

Currently, the model is inclined to decode and format its summarizations in such a way that creates formatting issues either for LaTeX or Obsidian.md. For example, the model would output a str containing

\ <command_name> instead of \<command_name>
{ when { is preferable
$ <latex_string> $ when $<latex_string>$ is needed for Obsidian.md.

The fix_summary_formatting function attempts to get around some of these issues.

text = r'\ to'
sample_output = fix_autogen_formatting(text)
assert r'\to' in sample_output

text = r'$d\ in\ mathbb{ Z}_{\ geq 0} $'
sample_output = fix_autogen_formatting(text)
assert r'\in' in sample_output
assert r'\mathbb{Z}' in sample_output
assert r'\geq 0' in sample_output

text = r'There are some extra spaces in this math mode string: $  5 + 7 = 12 $.'
sample_output = fix_autogen_formatting(text)
print(sample_output)
assert r'$5' in sample_output
assert r'12$' in sample_output

There are some extra spaces in this math mode string: $5 + 7 = 12$ .

text=  r'the group of $G$-coinvariants of $A$. It is defined as $$A_{G} :=A / I_\G} A$$'
sample_output = fix_autogen_formatting(text)
print(sample_output)

the group of $G$ -coinvariants of $A$ . It is defined as 

$$A_{G} :=A / I_\G} A$$

Correct syntax errors in autogenerated math mode strings

correct_latex_syntax_error

 correct_latex_syntax_error (summary:str,
                             replacement_candidates:list[str], syntax_vali
                             dation:Callable[[str],bool]=<function
                             math_mode_string_is_syntactically_valid>)

*Attempt to replace within summary a modified version in which the syntactically incorrect latex math mode strings are replaced with the most closely resembling element of replacement_candidates.

with a modified version in which the latex math mode strings within summary that are syntactically incorrect

TODO: consider the possibility that not all math mode str delimiters are formatted correctly.*

	Type	Default	Details
summary	str		The autogenerated summary
replacement_candidates	list		A list of candidates to replace. This is expected to be an output of `_list_of_candidates_from_math_mode_strings`
syntax_validation	Callable	math_mode_string_is_syntactically_valid	A test to tell whether a math mode string is syntactically valid.
Returns	str

sample_summary = r'the group of $G$-coinvariants of $A$. It is defined as $$A_{G} :=A / I_\G} A$$'
replacement_candidates = [
    'A',
    'A_',
    'A_{G}',
    'A_{G}:=A',
    'A_{G}:=A',
    'A_{G}:=A /',
    'A_{G}:=A / I_{G}',
    'A_{G}:=A / I_{G} A',
    'H_{0}(G, A)',
    'H_{0}(G, A) \\simeq',
    'H_{0}(G, A) \\simeq A',
    'H_{0}(G, A) \\simeq A_',
    'H_{0}(G, A) \\simeq A_{G}',
]
test_eq(correct_latex_syntax_error(sample_summary, replacement_candidates), r'the group of $G$-coinvariants of $A$. It is defined as $$A_{G}:=A / I_{G} A$$')
# replacement_candidates