-#!/usr/bin/python3
+#!/usr/bin/env python3
+
+# NOTE: most of this code written with qwen3-coder:30b
+
+import os
 import requests
-from lxml import etree
+from pathlib import Path
 import json
-import sys
-def get_package_info_from_upstream(distro, package_name):
-    if distro == 'debian':
-        distro_url = "https://packages.debian.org/search?keywords=" + package_name + "&searchon=names&suite=all&section=all"
-    elif distro == 'ubuntu':
-        distro_url = "https://packages.ubuntu.com/search?keywords=" + package_name + "&searchon=names&suite=all&section=all"
+import re
+import gzip
+from urllib.parse import urljoin
+
+def get_debian_release_names(cache_dir="./debian_cache"):
+    """
+    Get Debian release names from the README file
+    """
+    # Create cache directory if it doesn't exist
+    Path(cache_dir).mkdir(exist_ok=True)
+
+    # Build URL
+    readme_url = "http://deb.debian.org/debian/dists/README"
+    readme_path = os.path.join(cache_dir, "README")
+
+    # Check if we already have the README file
+    if os.path.exists(readme_path):
+        print(f"Using cached README: {readme_path}")
+        with open(readme_path, 'r') as f:
+            readme_content = f.read()
     else:
-        print("invalid distro %s, quit" % distro)
-        sys.exit(1)
-    # Step 1: Fetch HTML content from the URL
-    response = requests.get(distro_url)
-    html_content = response.content  # Use .content for lxml to handle byte data
-    # Step 2: Parse HTML with lxml
-    parser = etree.HTMLParser()
-    tree = etree.fromstring(html_content, parser)
-    # Step 3: Extract data
-    for h3 in tree.xpath('//h3'):
-        section_title = h3.text
-        ul = h3.xpath('./following-sibling::ul[1]')
-        debian_all_package_info = {}
-        if ul:
-            list_items = ul[0].xpath('.//li')
-            for li in list_items:
-                debian_package_info = {}
-                item_text = li.xpath('.//text()[not(parent::a)]')
-                item_class = li.get("class")
-                package_file_release = item_class
-                package_file_version = item_text[1].split(":")[0]
-                architectures = ["arm64", "armhf", "amd64", "riscv64", "loong64"]
-                arch_info = item_text[1].split(":")[1]
-                for arch in architectures:
-                    if arch in arch_info:
-                        package_filename = f"{package_name}_{package_file_version}_{arch}.deb"
-                        debian_package_info[arch] = package_filename
-                debian_all_package_info[item_class] = debian_package_info
-    return debian_all_package_info
-if len(sys.argv) < 2:
-    print("Usage: python parse.py <package_name>")
-    sys.exit(1)
-package_name = sys.argv[1]
-debian_info = get_package_info_from_upstream("debian", package_name)
-ubuntu_info = get_package_info_from_upstream("ubuntu", package_name)
-if debian_info and ubuntu_info:
-    all_info_result = {**debian_info, **ubuntu_info}
-    json_file_name = package_name + ".json"
+        print("Downloading README...")
+        response = requests.get(readme_url, timeout=30)
+        response.raise_for_status()
+        readme_content = response.text
+
+        # Save to cache
+        with open(readme_path, 'w') as f:
+            f.write(readme_content)
+
+    # Extract release names using regex
+    # Pattern: \S+, or (\S+)\s+ - matches "oldstable, or bookworm" and captures "bookworm"
+    release_pattern = r'\S+, or (\S+)\s+'
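+    # Walking the example above through the pattern (illustrative; the actual
+    # README wording may differ): a line containing "oldstable, or bookworm"
+    # captures "bookworm", which is recorded below as "debian/bookworm".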
+
+    releases = []
+    for line in readme_content.split('\n'):
+        if line.strip():
+            match = re.search(release_pattern, line)
+            if match:
+                release_name = match.group(1)
+                releases.append(f"debian/{release_name}")
+                print(f"Found release: {release_name}")
+
+    return releases
+
+def get_debian_architectures(distro, release_name, cache_dir="./debian_cache"):
+    """
+    Get supported architectures for a Debian release from InRelease file
+    """
+    # Create cache directory if it doesn't exist
+    Path(cache_dir).mkdir(exist_ok=True)
+
+    # Build URLs
+    match distro:
+        case 'debian':
+            base_url = "http://deb.debian.org/debian"
+        case 'ubuntu':
+            base_url = "http://archive.ubuntu.com/ubuntu"
+        case _:
+            # Guard against typos: without this, base_url would be unbound below.
+            raise ValueError(f"unsupported distro: {distro}")
+    inrelease_url = f"{base_url}/dists/{release_name}/InRelease"
+    inrelease_path = os.path.join(cache_dir, f"{release_name}_InRelease")
+
+    # Check if we already have the file
+    if os.path.exists(inrelease_path):
+        #print(f"Using cached file: {inrelease_path}")
+        with open(inrelease_path, 'r') as f:
+            inrelease_content = f.read()
+    else:
+        #print(f"Downloading InRelease for {release_name}...")
+        response = requests.get(inrelease_url, timeout=30)
+        response.raise_for_status()
+        inrelease_content = response.text
+
+        # Save to cache
+        with open(inrelease_path, 'w') as f:
+            f.write(inrelease_content)
+
+    # Extract architectures from the InRelease file
+    # Look for the "Architectures:" line
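+    # In the Release file format this is a single space-separated field, e.g.:
+    #   Architectures: all amd64 arm64 armel armhf ...
+    # (the exact list varies by release)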
+    architectures = []
+
+    # Split by lines and look for architectures
+    for line in inrelease_content.split('\n'):
+        if line.lower().startswith('architectures:'):
+            # Extract architectures after the colon
+            arch_line = line.split(':', 1)[1].strip()
+            architectures = [arch.strip() for arch in arch_line.split() if arch.strip()]
+            break
+
+    if architectures:
+        print(f"Supported architectures for {release_name}: {architectures}")
+        if 'all' in architectures:
+            # 'all' marks architecture-independent packages, not a real
+            # machine architecture, so drop it from the list.
+            architectures.remove('all')
+        return architectures
+    else:
+        print("Could not find Architectures field in InRelease file")
+        return []
+
+def get_debian_srcpkg_architecture(distro, release_name, package_name, cache_dir="./debian_cache"):
+    """
+    Get the Architecture field of a source package in a Debian release
+    """
+    # Create cache directory if it doesn't exist
+    Path(cache_dir).mkdir(exist_ok=True)
+
+    # Build URLs
+    match distro:
+        case 'debian':
+            base_url = "http://deb.debian.org/debian"
+        case 'ubuntu':
+            #base_url = "http://archive.ubuntu.com/ubuntu"
+            base_url = "http://ports.ubuntu.com"
+        case _:
+            raise ValueError(f"unsupported distro: {distro}")
+
+    sources_url = f"{base_url}/dists/{release_name}/main/source/Sources.gz"
+    sources_path = os.path.join(cache_dir, f"{release_name}_Sources.gz")
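+    # For illustration, for debian/trixie this resolves to
+    # http://deb.debian.org/debian/dists/trixie/main/source/Sources.gz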
+
+    # Check if we already have the Sources.gz file
+    if os.path.exists(sources_path):
+        print(f"Using cached Sources.gz: {sources_path}")
+    else:
+        print(f"Downloading Sources.gz for {release_name}...")
+        response = requests.get(sources_url, timeout=30)
+        response.raise_for_status()
+
+        # Save to cache
+        with open(sources_path, 'wb') as f:
+            f.write(response.content)
+
+    # Decompress and read
+    with gzip.open(sources_path, 'rt') as f:
+        sources_content = f.read()
+
+    # Parse the Sources file to find the package
+    package_info = parse_sources_for_package(sources_content, package_name)
+
+    if package_info:
+        return package_info['architecture']
+    else:
+        raise FileNotFoundError(f"Package '{package_name}' not found in {distro}/{release_name} Sources.gz")
+
+def parse_sources_for_package(sources_content, package_name):
+    """
+    Parse Sources.gz content to find package information
+    """
+    # Split into individual package entries
+    packages = sources_content.split('\n\n')
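+    # Each entry is a deb822-style stanza separated by a blank line; sketch of
+    # the expected shape (abridged, field values illustrative):
+    #   Package: base-files
+    #   Version: 13.8
+    #   Architecture: any
+    #   ...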
+
+    for package_entry in packages:
+        if not package_entry.strip():
+            continue
+
+        package_info = {}
+        for line in package_entry.split('\n'):
+            if ':' in line:
+                key, value = line.split(':', 1)
+                package_info[key.strip().lower()] = value.strip()
+
+        # Check if this is our package
+        if package_info.get('package', '').lower() == package_name.lower():
+            return package_info
+
+    return None
+
+def get_debian_binary_package_filename(distro, release_name, package_name, architecture='arm64', cache_dir="./debian_cache"):
+    """
+    Get the binary package filename for a given package in a Debian release
+    This is more complex because we need to parse Packages files
+    """
+    # Create cache directory if it doesn't exist
+    Path(cache_dir).mkdir(exist_ok=True)
+
+    # Build URLs for Packages file
+    match distro:
+        case 'debian':
+            if architecture == 'loong64':
+                # loong64 lives in the debian-ports archive, not the main one
+                base_url = "http://ftp.ports.debian.org/debian-ports"
+            else:
+                base_url = "http://ftp.debian.org/debian"
+        case 'ubuntu':
+            if re.match("(i386|amd64)", architecture):  # regex, not ==, so amd64v3 also hits the main archive
+                base_url = "http://archive.ubuntu.com/ubuntu"
+            else:
+                base_url = "http://ports.ubuntu.com"
+        case _:
+            raise ValueError(f"unsupported distro: {distro}")
+    packages_url = f"{base_url}/dists/{release_name}/main/binary-{architecture}/Packages.gz"
+    packages_path = os.path.join(cache_dir, f"{release_name}_{architecture}_Packages.gz")
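+    # For illustration, debian/sid on arm64 fetches
+    # http://ftp.debian.org/debian/dists/sid/main/binary-arm64/Packages.gz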
+
+    # Check if we already have the Packages.gz file
+    if os.path.exists(packages_path):
+        print(f"Using cached Packages.gz: {packages_path}")
+    else:
+        print(f"Downloading Packages.gz for {release_name} ({architecture})...")
+        response = requests.get(packages_url, timeout=30)
+        response.raise_for_status()
+
+        # Save to cache
+        with open(packages_path, 'wb') as f:
+            f.write(response.content)
+
+    # Decompress and read
+    with gzip.open(packages_path, 'rt') as f:
+        packages_content = f.read()
+
+    # Parse the Packages file to find the package
+    package_info = parse_packages_for_package(packages_content, package_name)
+
+    if package_info:
+        # Synthesize the package filename
+        filename = synthesize_binary_package_filename(package_info)
+        #print(f"Synthesized binary package filename: {filename}")
+        return filename
+    else:
+        print(f"Binary package '{package_name}' not found in {release_name}/binary-{architecture}/Packages.gz")
+        return None
+
+def parse_packages_for_package(packages_content, package_name):
+    """
+    Parse Packages.gz content to find package information
+    """
+    # Split into individual package entries
+    packages = packages_content.split('\n\n')
+
+    for package_entry in packages:
+        if not package_entry.strip():
+            continue
+
+        package_info = {}
+        for line in package_entry.split('\n'):
+            if ':' in line:
+                key, value = line.split(':', 1)
+                package_info[key.strip().lower()] = value.strip()
+
+        # Check if this is our package
+        if package_info.get('package', '').lower() == package_name.lower():
+            return package_info
+
+    return None
+
+def synthesize_binary_package_filename(package_info):
+    """
+    Synthesize the Debian binary package filename from package info
+    """
+    # Extract needed fields
+    package = package_info.get('package', 'unknown')
+    version = package_info.get('version', '0.0.0')
+    architecture = package_info.get('architecture', 'all')
+
+    # For binary packages, the filename format is:
+    # package_version_architecture.deb
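+    # e.g., with illustrative (not real) values:
+    #   package=base-files, version=13.8, architecture=arm64
+    #   -> base-files_13.8_arm64.deb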
+    filename = f"{package}_{version}_{architecture}.deb"
+
+    return filename
+
+# Example usage:
+if __name__ == "__main__":
+    releases = get_debian_release_names()
+    # rc-buggy is the archive's alias for experimental; skip it
+    if 'debian/rc-buggy' in releases:
+        releases.remove('debian/rc-buggy')
+    # FIXME: these are fetchable from changelogs.ubuntu.com/meta-release,
+    # filtered by 'Supported: 1'.
+    # Don't do this yet b/c jammy goes EOS Apr 2027 and we don't know if we'll be ready.
+    # Also, resolute isn't in the changelog as of 2025Dec03.
+    releases += ['ubuntu/jammy', 'ubuntu/noble', 'ubuntu/plucky', 'ubuntu/questing', 'ubuntu/resolute']
+    release_hash = {}
+    for release in releases:
+        distro, release = release.split('/')
+        packages = {}
+
+        pkg_architecture = get_debian_srcpkg_architecture(distro, release, "base-files")
+
+        # Get architectures from InRelease
+        print("\n=== Architecture List ===")
+        arch_list = pkg_architecture.split()
+        if 'any' in arch_list:
+            architectures = get_debian_architectures(distro, release)
+        else:
+            architectures = arch_list
+        if release == 'sid':
+            # loong64 is hidden away in /debian-ports/
+            architectures += ['loong64']
+
+        # Get binary package filename
+        #print("\n=== Binary Package ===")
+        # NOTE: we *cheat* here because base-files is always built for all architectures.
+        # This is NOT a generic method usable for all cases; for that you have to check Sources above.
+        for architecture in architectures:
+            binary_filename = get_debian_binary_package_filename(distro, release, "base-files", architecture)
+            packages[architecture] = binary_filename
+        release_hash[release] = packages
+
+    json_content = json.dumps(release_hash)
+    print(json_content)
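+    # The result maps release -> {architecture: filename}, along the lines of
+    # (values illustrative): {"sid": {"arm64": "base-files_13.8_arm64.deb", ...}, ...}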
+    json_file_name = "base-files.json"
     with open(json_file_name, "w") as outfile:
-    json.dump(all_info_result, outfile)
-else:
-    print("failed to get package info")
-    sys.exit(1)
+        outfile.write(json_content)