Coverage for sources/librovore/structures/mkdocs/extraction.py: 11%

94 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-20 18:40 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' MkDocs documentation content extraction and processing. ''' 

22 

23 

24from bs4 import BeautifulSoup as _BeautifulSoup 

25 

26from . import __ 

27from .patterns import THEME_PATTERNS as _THEME_PATTERNS 

28from .patterns import UNIVERSAL_PATTERNS as _UNIVERSAL_PATTERNS 

29 

30 

async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects from MkDocs.

        Extraction failures for individual objects are dropped silently;
        only successfully extracted documents appear in the result.
    '''
    base_url = __.normalize_base_url( source )
    if not objects: return [ ]
    extractions = [
        _extract_object_documentation(
            auxdata, base_url, source, obj, theme )
        for obj in objects ]
    outcomes = await __.asyncf.gather_async(
        *extractions, return_exceptions = True )
    documents: list[ __.ContentDocument ] = [ ]
    for outcome in outcomes:
        # Keep only successful outcomes which produced a document.
        if not __.generics.is_value( outcome ): continue
        if outcome.value is None: continue
        documents.append( outcome.value )
    return documents

50 

51 

def parse_mkdocs_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses MkDocs HTML content to extract documentation sections.

        Raises parse/absence failures when the HTML cannot be parsed,
        the main content container is missing, or the target element
        cannot be located.
    '''
    try:
        document = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    container = _find_main_content_container( document, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    target = _find_target_element( container, element_id )
    if not target:
        raise __.DocumentationObjectAbsence( element_id, url )
    description = _extract_content_from_element( target, element_id, theme )
    return {
        'description': description,
        'object_name': element_id,
    }

72 

73 

74 

75 

def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content.

        Currently a passthrough: ``cleanup_selectors`` is accepted for
        interface stability but is not yet applied to ``content``.
    '''
    # TODO: Strip elements matching cleanup_selectors from content.
    return content

83 

84 

def _convert_to_markdown( html_content: str ) -> str:
    ''' Converts HTML content to markdown format using markdownify. '''
    # Imported lazily so module import does not pay for markdownify.
    import markdownify
    converter = markdownify.markdownify
    return converter( html_content, heading_style = 'ATX' )

89 

90 

def _derive_documentation_url(
    base_url: __.typx.Any, uri: str, object_name: str
) -> __.typx.Any:
    ''' Derives documentation URL from base URL and object URI.

        Appends the URI path to the base URL path and selects the
        fragment: explicit fragment from the URI when present,
        otherwise the object name.
    '''
    if uri.endswith( '#$' ):
        # mkdocstrings convention: '#$' means "anchor is the object name".
        return base_url._replace(
            path = f"{base_url.path}/{uri[ : -2 ]}",
            fragment = object_name )
    path_part, separator, fragment = uri.partition( '#' )
    if separator:
        return base_url._replace(
            path = f"{base_url.path}/{path_part}",
            fragment = fragment )
    return base_url._replace(
        path = f"{base_url.path}/{uri}",
        fragment = object_name )

106 

107 

def _extract_content_from_element(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> str:
    ''' Extracts description content using universal patterns. '''
    # element_id and theme are kept for interface parity; extraction
    # currently relies only on universal cleanup selectors.
    selectors = (
        _UNIVERSAL_PATTERNS[ 'navigation_cleanup' ][ 'universal_selectors' ] )
    return _cleanup_content( _extract_description( element ), selectors )

119 

120 

def _extract_description( element: __.typx.Any ) -> str:
    ''' Extracts description content from element.

        Returns the inner HTML of the doc-contents container, or an
        empty string when no such container exists.
    '''
    container = _find_doc_contents_container( element )
    return container.decode_contents( ) if container else ''

127 

128 

async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    location: str,
    obj: __.InventoryObject,
    theme: __.Absential[ str ] = __.absent,
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object from MkDocs site.

        Best-effort: returns ``None`` when retrieval or parsing fails;
        failures are logged at debug level rather than propagated.
    '''
    doc_url = _derive_documentation_url(
        base_url, obj.uri, obj.name )
    try:
        html_content = (
            await __.retrieve_url_as_text(
                auxdata.content_cache, doc_url ) )
    except Exception as exc:
        __.acquire_scribe( __name__ ).debug(
            "Failed to retrieve %s: %s", doc_url, exc )
        return None
    anchor = doc_url.fragment or str( obj.name )
    try:
        parsed_content = parse_mkdocs_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception as exc:
        # Previously swallowed silently; log for parity with the
        # retrieval-failure path so extraction gaps are diagnosable.
        __.acquire_scribe( __name__ ).debug(
            "Failed to parse %s: %s", doc_url, exc )
        return None
    description = _convert_to_markdown( parsed_content[ 'description' ] )
    content_id = __.produce_content_id( location, obj.name )
    return __.ContentDocument(
        inventory_object = obj,
        content_id = content_id,
        description = description,
        documentation_url = doc_url.geturl( ),
        extraction_metadata = __.immut.Dictionary( {
            'theme': theme if not __.is_absent( theme ) else 'unknown',
            'extraction_method': 'mkdocs_html_parsing',
            'relevance_score': 1.0,
            'match_reasons': [ 'direct extraction' ],
        } )
    )

166 

167 

168 

169 

170 

def _find_doc_contents_container( element: __.typx.Any ) -> __.typx.Any | None:
    ''' Finds the doc-contents container for the element.

        For headings, scans following siblings for a ``div.doc-contents``
        (the mkdocstrings layout); otherwise searches descendants.
    '''
    heading_names = ( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' )
    if element.name in heading_names:
        sibling = element.next_sibling
        while sibling:
            is_doc_contents = (
                hasattr( sibling, 'get' )
                and sibling.name == 'div'
                and 'doc-contents' in sibling.get( 'class', [ ] ) )
            if is_doc_contents: return sibling
            sibling = sibling.next_sibling
    return element.select_one( '.doc-contents' )

182 

183 

def _find_target_element(
    container: __.typx.Any, element_id: str
) -> __.typx.Any:
    ''' Finds target element within main container using ID strategies.

        Falls back to the container itself when no strategy matches.
    '''
    # Strategy 1: exact HTML id attribute match.
    found = container.find( id = element_id )
    if found: return found
    # Strategy 2: mkdocstrings data-toc-label attribute.
    found = container.find( attrs = { 'data-toc-label': element_id } )
    if found: return found
    # Strategy 3: heading whose text mentions the identifier.
    headings = container.find_all(
        [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ] )
    for heading in headings:
        if element_id in heading.get_text( ): return heading
    # Strategy 4: section carrying the identifier in its class list.
    for section in container.find_all( 'section' ):
        classes = section.get( 'class' )
        if classes and element_id in ' '.join( classes ):
            return section
    return container

201 

202 

def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds main content container trying theme-specific patterns first.

        Returns the first matching container, or ``absent`` when no
        selector matches.
    '''
    themed_containers = _THEME_PATTERNS[ 'content_containers' ]
    selectors: list[ str ] = [ ]
    if not __.is_absent( theme ) and theme in themed_containers:
        selectors.extend( themed_containers[ theme ] )
    # Universal selectors act as the fallback after theme-specific ones.
    selectors.extend(
        _UNIVERSAL_PATTERNS[ 'content_containers' ][ 'universal_selectors' ] )
    for selector in selectors:
        candidate = soup.select_one( selector )
        if candidate: return candidate
    return __.absent