|
|
Кафедра юридических дисциплин
# Read the uploaded file, parse entries, classify by type (FOS vs RP), then build an HTML table and save to /mnt/data/таблица_дисциплин.html
import re, html, os
# Location of the uploaded text dump that holds the HTML fragment to parse.
in_path = '/mnt/data/новый 1.txt'

# NOTE(review): the tag parts of the two markup patterns below assume the
# input is a series of <p>...</p> paragraphs that may contain <a href=...>
# links -- confirm against the real input markup.
_P_BLOCK_RE = re.compile(r'<p\b[^>]*>.*?</p>', re.DOTALL | re.IGNORECASE)
_ANCHOR_RE = re.compile(
    r'<a\b[^>]*href=["\']([^"\']+)["\'][^>]*>(.*?)</a>',
    re.DOTALL | re.IGNORECASE,
)
_TAG_RE = re.compile(r'<[^>]+>')
# A code is a run of upper-case letters, digits, dots and hyphens.  Whitespace
# between code and name is mandatory: with an optional separator the regex
# backtracks inside a single word and splits e.g. "МДК" into "МД" + "К".
_CODE_NAME_RE = re.compile(r'^\s*([А-ЯЁA-Z.\d\-]+)\s+(.+)$')
_FOS_RE = re.compile(r'/fos/', re.IGNORECASE)
_RP_RE = re.compile(r'/rp/|/rpd/|/rp_', re.IGNORECASE)


def _visible_text(fragment):
    """Return *fragment* with tags removed, entities decoded and ends trimmed."""
    return html.unescape(_TAG_RE.sub('', fragment)).strip()


def _classify_href(href):
    """Return 'fos', 'rp' or 'other' depending on path markers found in *href*."""
    if _FOS_RE.search(href):
        return 'fos'
    if _RP_RE.search(href):
        return 'rp'
    return 'other'


def _split_code_name(text):
    """Return ``(code, name)`` parsed from *text*, or ``None`` if it has no leading code."""
    found = _CODE_NAME_RE.match(text)
    if found is None:
        return None
    return found.group(1).strip(), found.group(2).strip()


def _make_entry(code, name, entry_type, text, href=None, href_type=None):
    """Build one entry dict with the keys the consolidation step reads."""
    return {
        'code': code,
        'name': name,
        'type': entry_type,
        'href_type': href_type,
        'href': href,
        'text': text,
    }


def _entries_from_block(block):
    """Return the entries described by one paragraph block.

    A block with links yields one entry per link (text outside the links is
    ignored); a block without links yields at most one plain-text entry.
    """
    anchors = _ANCHOR_RE.findall(block)
    if anchors:
        found = []
        for href, inner in anchors:
            text = _visible_text(inner)
            split = _split_code_name(text)
            if split is None:
                # Fallback: first word is the code, the remainder is the name.
                parts = text.split(None, 1)
                code = parts[0] if parts else ''
                name = parts[1] if len(parts) > 1 else ''
            else:
                code, name = split
            # The entry type is the part of the code before the first dot
            # (the whole code when it has no dot).
            found.append(_make_entry(code, name, code.partition('.')[0], text,
                                     href=href, href_type=_classify_href(href)))
        return found

    # No link: fall back to the plain text of the paragraph.
    text = _visible_text(block)
    if not text:
        return []
    split = _split_code_name(text)
    if split is None:
        # Stand-alone text such as a section heading ("МДК").
        return [_make_entry('', text, text, text)]
    code, name = split
    return [_make_entry(code, name, code.partition('.')[0], text)]


def _consolidate(parsed):
    """Merge entries that share the same visible text into one row each.

    The full visible text is the key, so lines that differ only by a suffix
    (ООО/СОО) stay separate.  The first entry seen for a key supplies
    code/name/type; each linked entry fills the slot named by its href_type,
    a later link of the same kind overwriting an earlier one.
    """
    merged = {}
    for entry in parsed:
        row = merged.setdefault(entry['text'], {
            'code': entry['code'],
            'name': entry['name'],
            'type': entry['type'],
            'fos': None,
            'rp': None,
            'other': None,
        })
        # Plain-text entries carry no link, so there is nothing to record.
        if entry['href_type'] is not None:
            row[entry['href_type']] = entry['href']
    return merged


# errors='ignore' drops undecodable bytes instead of failing on them.
with open(in_path, 'r', encoding='utf-8', errors='ignore') as f:
    txt = f.read()

# Every paragraph block found in the input, tags included.
blocks = _P_BLOCK_RE.findall(txt)

# List of dicts: {code, name, type, href_type, href, text}.
entries = []
for b in blocks:
    entries.extend(_entries_from_block(b))

# Rows keyed by visible text, each with its collected fos / rp / other links.
rows = _consolidate(entries)
|



|