Coverage for sources/librovore/structures/mkdocs/extraction.py: 11%

94 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-20 18:40 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' MkDocs documentation content extraction and processing. ''' 

22 

23 

24from bs4 import BeautifulSoup as _BeautifulSoup 

25 

26from . import __ 

27from .patterns import THEME_PATTERNS as _THEME_PATTERNS 

28from .patterns import UNIVERSAL_PATTERNS as _UNIVERSAL_PATTERNS 

29 

30 

async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects from MkDocs.

        Extraction failures for individual objects are dropped silently;
        only successfully extracted documents appear in the result.
    '''
    base_url = __.normalize_base_url( source )
    if not objects: return [ ]
    extractions = [
        _extract_object_documentation(
            auxdata, base_url, source, obj, theme )
        for obj in objects ]
    outcomes = await __.asyncf.gather_async(
        *extractions, return_exceptions = True )
    documents: list[ __.ContentDocument ] = [ ]
    for outcome in outcomes:
        # Keep only successful outcomes which produced a document.
        if not __.generics.is_value( outcome ): continue
        if outcome.value is None: continue
        documents.append( outcome.value )
    return documents

50 

51 

def parse_mkdocs_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses MkDocs HTML content to extract documentation sections.

        Raises parse/absence failures when the HTML cannot be parsed,
        the main content container is missing, or the target element
        cannot be located.
    '''
    try:
        document = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    container = _find_main_content_container( document, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    target = _find_target_element( container, element_id )
    if not target:
        raise __.DocumentationObjectAbsence( element_id, url )
    description = _extract_content_from_element( target, element_id, theme )
    return {
        'description': description,
        'object_name': element_id,
    }

72 

73 

74 

75 

def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content.

        Currently a passthrough: ``cleanup_selectors`` is accepted for
        interface stability but is not yet applied to ``content``.
    '''
    # TODO: Strip elements matching cleanup_selectors from content.
    return content

83 

84 

def _convert_to_markdown( html_content: str ) -> str:
    ''' Converts HTML content to markdown format using markdownify. '''
    # Imported lazily so module import does not pay for markdownify.
    import markdownify
    converter = markdownify.markdownify
    return converter( html_content, heading_style = 'ATX' )

89 

90 

def _derive_documentation_url(
    base_url: __.typx.Any, uri: str, object_name: str
) -> __.typx.Any:
    ''' Derives documentation URL from base URL and object URI.

        Appends the URI path to the base URL path and selects the
        fragment: explicit fragment from the URI when present,
        otherwise the object name.
    '''
    if uri.endswith( '#$' ):
        # mkdocstrings convention: '#$' means "anchor is the object name".
        return base_url._replace(
            path = f"{base_url.path}/{uri[ : -2 ]}",
            fragment = object_name )
    path_part, separator, fragment = uri.partition( '#' )
    if separator:
        return base_url._replace(
            path = f"{base_url.path}/{path_part}",
            fragment = fragment )
    return base_url._replace(
        path = f"{base_url.path}/{uri}",
        fragment = object_name )

106 

107 

def _extract_content_from_element(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> str:
    ''' Extracts description content using universal patterns. '''
    # element_id and theme are kept for interface parity; extraction
    # currently relies only on universal cleanup selectors.
    selectors = (
        _UNIVERSAL_PATTERNS[ 'navigation_cleanup' ][ 'universal_selectors' ] )
    return _cleanup_content( _extract_description( element ), selectors )

119 

120 

def _extract_description( element: __.typx.Any ) -> str:
    ''' Extracts description content from element.

        Returns the inner HTML of the doc-contents container, or an
        empty string when no such container exists.
    '''
    container = _find_doc_contents_container( element )
    return container.decode_contents( ) if container else ''

127 

128 

async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    location: str,
    obj: __.InventoryObject,
    theme: __.Absential[ str ] = __.absent,
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object from MkDocs site.

        Best-effort: returns ``None`` when retrieval or parsing fails;
        failures are logged at debug level rather than propagated.
    '''
    doc_url = _derive_documentation_url(
        base_url, obj.uri, obj.name )
    try:
        html_content = (
            await __.retrieve_url_as_text(
                auxdata.content_cache, doc_url ) )
    except Exception as exc:
        __.acquire_scribe( __name__ ).debug(
            "Failed to retrieve %s: %s", doc_url, exc )
        return None
    anchor = doc_url.fragment or str( obj.name )
    try:
        parsed_content = parse_mkdocs_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception as exc:
        # Previously swallowed silently; log for parity with the
        # retrieval-failure path so extraction gaps are diagnosable.
        __.acquire_scribe( __name__ ).debug(
            "Failed to parse %s: %s", doc_url, exc )
        return None
    description = _convert_to_markdown( parsed_content[ 'description' ] )
    content_id = __.produce_content_id( location, obj.name )
    return __.ContentDocument(
        inventory_object = obj,
        content_id = content_id,
        description = description,
        documentation_url = doc_url.geturl( ),
        extraction_metadata = __.immut.Dictionary( {
            'theme': theme if not __.is_absent( theme ) else 'unknown',
            'extraction_method': 'mkdocs_html_parsing',
            'relevance_score': 1.0,
            'match_reasons': [ 'direct extraction' ],
        } )
    )

166 

167 

168 

169 

170 

def _find_doc_contents_container( element: __.typx.Any ) -> __.typx.Any | None:
    ''' Finds the doc-contents container for the element.

        For headings, scans following siblings for a ``div.doc-contents``
        (the mkdocstrings layout); otherwise searches descendants.
    '''
    heading_names = ( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' )
    if element.name in heading_names:
        sibling = element.next_sibling
        while sibling:
            is_doc_contents = (
                hasattr( sibling, 'get' )
                and sibling.name == 'div'
                and 'doc-contents' in sibling.get( 'class', [ ] ) )
            if is_doc_contents: return sibling
            sibling = sibling.next_sibling
    return element.select_one( '.doc-contents' )

182 

183 

def _find_target_element(
    container: __.typx.Any, element_id: str
) -> __.typx.Any:
    ''' Finds target element within main container using ID strategies.

        Falls back to the container itself when no strategy matches.
    '''
    # Strategy 1: exact HTML id attribute match.
    found = container.find( id = element_id )
    if found: return found
    # Strategy 2: mkdocstrings data-toc-label attribute.
    found = container.find( attrs = { 'data-toc-label': element_id } )
    if found: return found
    # Strategy 3: heading whose text mentions the identifier.
    headings = container.find_all(
        [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ] )
    for heading in headings:
        if element_id in heading.get_text( ): return heading
    # Strategy 4: section carrying the identifier in its class list.
    for section in container.find_all( 'section' ):
        classes = section.get( 'class' )
        if classes and element_id in ' '.join( classes ):
            return section
    return container

201 

202 

def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds main content container trying theme-specific patterns first.

        Returns the first matching container, or ``absent`` when no
        selector matches.
    '''
    themed_containers = _THEME_PATTERNS[ 'content_containers' ]
    selectors: list[ str ] = [ ]
    if not __.is_absent( theme ) and theme in themed_containers:
        selectors.extend( themed_containers[ theme ] )
    # Universal selectors act as the fallback after theme-specific ones.
    selectors.extend(
        _UNIVERSAL_PATTERNS[ 'content_containers' ][ 'universal_selectors' ] )
    for selector in selectors:
        candidate = soup.select_one( selector )
        if candidate: return candidate
    return __.absent