Skip to content

Commit 00d9780

Browse files
tabrisnet authored and igorpecovnik committed
generate-base-files-info-json.py - base-files info script rewrite to use apt repo-data
most code written with qwen3-coder:30b
1 parent 26a16ac commit 00d9780

File tree

1 file changed

+297
-50
lines changed

1 file changed

+297
-50
lines changed
Lines changed: 297 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,301 @@
1-
#!/usr/bin/python3
1+
#!/usr/bin/env python3
2+
3+
#NOTE: most of this code written with qwen3-coder:30b
4+
5+
import os
26
import requests
3-
from lxml import etree
7+
from pathlib import Path
48
import json
5-
import sys
6-
def get_package_info_from_upstream(distro, package_name):
7-
if distro == 'debian':
8-
distro_url = "https://packages.debian.org/search?keywords=" + package_name + "&searchon=names&suite=all&section=all"
9-
elif distro == 'ubuntu':
10-
distro_url = "https://packages.ubuntu.com/search?keywords=" + package_name + "&searchon=names&suite=all&section=all"
9+
import re
10+
import gzip
11+
from urllib.parse import urljoin
12+
13+
def get_debian_release_names(cache_dir="./debian_cache"):
    """
    Return the Debian release codenames listed in the archive's dists/README.

    The README is downloaded once and stored as ``<cache_dir>/README``; when
    that file already exists it is used as-is and no request is made.

    Args:
        cache_dir: Directory holding the cached README. It is created
            (including missing parents) when absent.

    Returns:
        A list of ``"debian/<codename>"`` strings in the order the codenames
        appear in the README.

    Raises:
        requests.HTTPError: if the README must be downloaded and the server
            responds with an error status.
    """
    # parents=True so a nested cache path does not fail on first use
    Path(cache_dir).mkdir(parents=True, exist_ok=True)

    readme_url = "http://deb.debian.org/debian/dists/README"
    readme_path = os.path.join(cache_dir, "README")

    if os.path.exists(readme_path):
        print(f"Using cached README: {readme_path}")
        with open(readme_path, 'r', encoding='utf-8') as f:
            readme_content = f.read()
    else:
        print("Downloading README...")
        response = requests.get(readme_url, timeout=30)
        response.raise_for_status()
        readme_content = response.text

        # Save to cache
        with open(readme_path, 'w', encoding='utf-8') as f:
            f.write(readme_content)

    # Lines of interest look like "oldstable, or bookworm  - ..."; the
    # codename is the token after ", or ". The pattern is anchored to the
    # start of the (stripped) line so prose containing ", or " elsewhere is
    # not mistaken for a release, and it does not require trailing text so a
    # codename at the very end of a line is still picked up.
    release_pattern = re.compile(r'\S+, or (\S+)')

    releases = []
    for line in readme_content.splitlines():
        found = release_pattern.match(line.strip())
        if found:
            release_name = found.group(1)
            releases.append(f"debian/{release_name}")
            print(f"Found release: {release_name}")

    return releases
53+
54+
def get_debian_architectures(distro, release_name, cache_dir="./debian_cache"):
    """
    Return the architectures a release advertises in its InRelease file.

    The InRelease file is cached as ``<cache_dir>/<release_name>_InRelease``
    and only downloaded when that file is missing.

    Args:
        distro: ``'debian'`` or ``'ubuntu'``; selects the archive mirror.
        release_name: Codename used in the ``dists/<release_name>`` path.
        cache_dir: Directory holding cached downloads. It is created
            (including missing parents) when absent.

    Returns:
        The whitespace-separated values of the first ``Architectures:`` line
        with the pseudo-architecture ``all`` removed, or ``[]`` when no such
        line (or an empty one) is found.

    Raises:
        ValueError: if *distro* is not a supported distribution.
        requests.HTTPError: if the file must be downloaded and the server
            responds with an error status.
    """
    base_urls = {
        'debian': "http://deb.debian.org/debian",
        'ubuntu': "http://archive.ubuntu.com/ubuntu",
    }
    # Fail with a clear message instead of an unbound-variable error
    if distro not in base_urls:
        raise ValueError(f"Unsupported distro: {distro!r}")
    base_url = base_urls[distro]

    Path(cache_dir).mkdir(parents=True, exist_ok=True)

    inrelease_url = f"{base_url}/dists/{release_name}/InRelease"
    inrelease_path = os.path.join(cache_dir, f"{release_name}_InRelease")

    if os.path.exists(inrelease_path):
        with open(inrelease_path, 'r', encoding='utf-8') as f:
            inrelease_content = f.read()
    else:
        response = requests.get(inrelease_url, timeout=30)
        response.raise_for_status()
        inrelease_content = response.text

        # Save to cache
        with open(inrelease_path, 'w', encoding='utf-8') as f:
            f.write(inrelease_content)

    # Only the first "Architectures:" line is considered
    architectures = []
    for line in inrelease_content.split('\n'):
        if line.lower().startswith('architectures:'):
            architectures = line.split(':', 1)[1].split()
            break

    if not architectures:
        print("Could not find Architectures field in InRelease file")
        return []

    print(f"Supported architectures for {release_name}: {architectures}")
    # 'all' is dropped from the result; callers only get real architectures
    if 'all' in architectures:
        architectures.remove('all')
    return architectures
105+
106+
def get_debian_srcpkg_architecture(distro, release_name, package_name, cache_dir="./debian_cache"):
    """
    Return the ``Architecture`` field of a source package in a release.

    The release's ``main/source/Sources.gz`` index is cached as
    ``<cache_dir>/<release_name>_Sources.gz`` and only downloaded when that
    file is missing.

    Args:
        distro: ``'debian'`` or ``'ubuntu'``; selects the archive mirror.
        release_name: Codename used in the ``dists/<release_name>`` path.
        package_name: Source package to look up.
        cache_dir: Directory holding cached downloads. It is created
            (including missing parents) when absent.

    Returns:
        The raw ``Architecture`` value of the matching stanza as a string
        (the caller splits it on whitespace).

    Raises:
        ValueError: if *distro* is not a supported distribution.
        FileNotFoundError: if the package is not present in the index.
        requests.HTTPError: if the index must be downloaded and the server
            responds with an error status.
    """
    # Base URLs carry no trailing slash so the joined URL has no "//dists".
    # NOTE(review): the original code had archive.ubuntu.com commented out in
    # favour of ports.ubuntu.com for Ubuntu; the reason is not recorded here.
    base_urls = {
        'debian': "http://deb.debian.org/debian",
        'ubuntu': "http://ports.ubuntu.com",
    }
    if distro not in base_urls:
        raise ValueError(f"Unsupported distro: {distro!r}")
    base_url = base_urls[distro]

    Path(cache_dir).mkdir(parents=True, exist_ok=True)

    sources_url = f"{base_url}/dists/{release_name}/main/source/Sources.gz"
    sources_path = os.path.join(cache_dir, f"{release_name}_Sources.gz")

    if os.path.exists(sources_path):
        print(f"Using cached Sources.gz: {sources_path}")
    else:
        print(f"Downloading Sources.gz for {release_name}...")
        response = requests.get(sources_url, timeout=30)
        response.raise_for_status()

        # Save to cache
        with open(sources_path, 'wb') as f:
            f.write(response.content)

    # Decode explicitly as UTF-8 rather than the locale default, which would
    # fail on non-ASCII maintainer names under e.g. LANG=C. A stray invalid
    # byte is replaced instead of aborting; the fields read here are ASCII.
    with gzip.open(sources_path, 'rt', encoding='utf-8', errors='replace') as f:
        sources_content = f.read()

    package_info = parse_sources_for_package(sources_content, package_name)
    if not package_info:
        raise FileNotFoundError(f"Package '{package_name}' not found in {distro}/{release_name} Sources.gz")

    return package_info['architecture']
147+
148+
def parse_sources_for_package(sources_content, package_name):
    """
    Find a package's stanza in the text of a Sources index.

    Stanzas are separated by blank lines. Within a stanza each
    ``Key: value`` line becomes one entry, with the key lower-cased and both
    sides stripped. Continuation lines (those starting with a space or tab)
    belong to the preceding multi-line field and are ignored, so a colon
    inside e.g. a checksum list cannot overwrite a real field.

    Args:
        sources_content: Decompressed text of the index.
        package_name: Package to look for; compared case-insensitively
            against the ``Package`` field.

    Returns:
        A dict of the first matching stanza's fields, or ``None`` when no
        stanza matches.
    """
    wanted = package_name.lower()

    for stanza in sources_content.split('\n\n'):
        if not stanza.strip():
            continue

        fields = {}
        for line in stanza.split('\n'):
            # Skip continuation lines of multi-line fields
            if line[:1] in (' ', '\t'):
                continue
            key, sep, value = line.partition(':')
            if sep:
                fields[key.strip().lower()] = value.strip()

        if fields.get('package', '').lower() == wanted:
            return fields

    return None
170+
171+
def get_debian_binary_package_filename(distro, release_name, package_name, architecture='arm64', cache_dir="./debian_cache"):
    """
    Return the ``.deb`` file name of a binary package for one architecture.

    The release's ``main/binary-<architecture>/Packages.gz`` index is cached
    as ``<cache_dir>/<release_name>_<architecture>_Packages.gz`` and only
    downloaded when that file is missing.

    Args:
        distro: ``'debian'`` or ``'ubuntu'``; selects the archive mirror.
        release_name: Codename used in the ``dists/<release_name>`` path.
        package_name: Binary package to look up.
        architecture: Architecture whose Packages index is consulted.
        cache_dir: Directory holding cached downloads. It is created
            (including missing parents) when absent.

    Returns:
        The synthesized file name, or ``None`` when the package is not
        listed in the index.

    Raises:
        ValueError: if *distro* is not a supported distribution.
        requests.HTTPError: if the index must be downloaded and the server
            responds with an error status.
    """
    # Base URLs carry no trailing slash so the joined URL has no "//dists".
    if distro == 'debian':
        if architecture == 'loong64':
            # loong64 is served from the debian-ports archive
            base_url = "http://ftp.ports.debian.org/debian-ports"
        else:
            base_url = "http://ftp.debian.org/debian"
    elif distro == 'ubuntu':
        # regex (prefix match) as there is amd64 and amd64v3
        if re.match("(i386|amd64)", architecture):
            base_url = "http://archive.ubuntu.com/ubuntu"
        else:
            base_url = "http://ports.ubuntu.com"
    else:
        raise ValueError(f"Unsupported distro: {distro!r}")

    Path(cache_dir).mkdir(parents=True, exist_ok=True)

    packages_url = f"{base_url}/dists/{release_name}/main/binary-{architecture}/Packages.gz"
    packages_path = os.path.join(cache_dir, f"{release_name}_{architecture}_Packages.gz")

    if os.path.exists(packages_path):
        print(f"Using cached Packages.gz: {packages_path}")
    else:
        print(f"Downloading Packages.gz for {release_name} ({architecture})...")
        response = requests.get(packages_url, timeout=30)
        response.raise_for_status()

        # Save to cache
        with open(packages_path, 'wb') as f:
            f.write(response.content)

    # Decode explicitly as UTF-8 rather than the locale default; a stray
    # invalid byte is replaced instead of aborting the whole run.
    with gzip.open(packages_path, 'rt', encoding='utf-8', errors='replace') as f:
        packages_content = f.read()

    package_info = parse_packages_for_package(packages_content, package_name)
    if not package_info:
        print(f"Binary package '{package_name}' not found for {architecture}/Packages.gz")
        return None

    return synthesize_binary_package_filename(package_info)
221+
222+
def parse_packages_for_package(packages_content, package_name):
    """
    Find a package's stanza in the text of a Packages index.

    Stanzas are separated by blank lines. Within a stanza each
    ``Key: value`` line becomes one entry, with the key lower-cased and both
    sides stripped. Continuation lines (those starting with a space or tab)
    belong to the preceding multi-line field and are ignored, so a colon
    inside e.g. an extended description cannot overwrite a real field.

    Args:
        packages_content: Decompressed text of the index.
        package_name: Package to look for; compared case-insensitively
            against the ``Package`` field.

    Returns:
        A dict of the first matching stanza's fields, or ``None`` when no
        stanza matches.
    """
    wanted = package_name.lower()

    for stanza in packages_content.split('\n\n'):
        if not stanza.strip():
            continue

        fields = {}
        for line in stanza.split('\n'):
            # Skip continuation lines of multi-line fields
            if line[:1] in (' ', '\t'):
                continue
            key, sep, value = line.partition(':')
            if sep:
                fields[key.strip().lower()] = value.strip()

        if fields.get('package', '').lower() == wanted:
            return fields

    return None
244+
245+
def synthesize_binary_package_filename(package_info):
    """
    Build a ``<package>_<version>_<architecture>.deb`` file name.

    Args:
        package_info: Mapping with lower-case ``package``, ``version`` and
            ``architecture`` keys. Missing keys fall back to ``'unknown'``,
            ``'0.0.0'`` and ``'all'`` respectively.

    Returns:
        The file name string. A leading numeric epoch (``"N:"``) in the
        version is dropped, because .deb file names do not include it.
    """
    package = package_info.get('package', 'unknown')
    version = package_info.get('version', '0.0.0')
    architecture = package_info.get('architecture', 'all')

    # "2:9.1.0-1" -> "9.1.0-1"; versions without an epoch are left untouched
    epoch, sep, remainder = version.partition(':')
    if sep and epoch.isdigit():
        version = remainder

    return f"{package}_{version}_{architecture}.deb"
259+
260+
def main():
    """
    Collect base-files package file names per release and architecture.

    For every Debian release found in the archive README (minus rc-buggy)
    plus a fixed list of Ubuntu releases, look up the base-files .deb file
    name for each architecture, print the result as JSON and write it to
    ``base-files.json`` in the current directory. The JSON maps release
    codename -> architecture -> file name (``null`` when not found).
    """
    releases = get_debian_release_names()
    if 'debian/rc-buggy' in releases:
        releases.remove('debian/rc-buggy')
    # FIXME: these are fetchable from changelogs.ubuntu.com/meta-release
    # filter by 'Supported: 1'.
    # Don't do this yet b/c jammy goes EOS Apr 2027, we don't know if we'll be ready.
    # also resolute isn't in changelog as of 2025Dec03
    releases += ['ubuntu/jammy', 'ubuntu/noble', 'ubuntu/plucky', 'ubuntu/questing', 'ubuntu/resolute']

    release_hash = {}
    for release_id in releases:
        # release_id is "<distro>/<codename>"
        distro, release = release_id.split('/')
        packages = {}

        pkg_architecture = get_debian_srcpkg_architecture(distro, release, "base-files")

        print("\n=== Architecture List ===")
        arch_list = pkg_architecture.split()
        if 'any' in arch_list:
            # "any" in the source stanza: use the release's architecture list
            architectures = get_debian_architectures(distro, release)
        else:
            architectures = arch_list
        if release == 'sid' and 'loong64' not in architectures:
            # loong64 is hidden away in /debian-ports/
            architectures.append('loong64')

        # NOTE: we *cheat* here because base-files is always built for all architectures.
        # this is NOT a generic method usable for all cases. for that you have to check Sources above
        for architecture in architectures:
            packages[architecture] = get_debian_binary_package_filename(
                distro, release, "base-files", architecture
            )
        release_hash[release] = packages

    json_content = json.dumps(release_hash)
    print(json_content)
    json_file_name = "base-files.json"
    with open(json_file_name, "w") as outfile:
        outfile.write(json_content)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)