import json
import re
from pathlib import Path
COMPRESSIBLE_EXTENSIONS = {".md", ".txt", ".markdown", ".rst"}
SKIP_EXTENSIONS = {
".py", ".js", ".ts", ".tsx", ".jsx", ".json", ".yaml", ".yml",
".toml", ".env", ".lock", ".css", ".scss", ".html", ".xml",
".sql", ".sh", ".bash", ".zsh", ".go", ".rs", ".java", ".c",
".cpp", ".h", ".hpp", ".rb", ".php", ".swift", ".kt", ".lua",
".dockerfile", ".makefile", ".csv", ".ini", ".cfg",
}
CODE_PATTERNS = [
re.compile(r"^\s*(import |from .+ import |require\(|const |let |var )"),
re.compile(r"^\s*(def |class |function |async function |export )"),
re.compile(r"^\s*(if\s*\(|for\s*\(|while\s*\(|switch\s*\(|try\s*\{)"),
re.compile(r"^\s*[\}\]\);]+\s*$"), re.compile(r"^\s*@\w+"), re.compile(r'^\s*"[^"]+"\s*:\s*'), re.compile(r"^\s*\w+\s*=\s*[{\[\(\"']"), ]
def _is_code_line(line: str) -> bool:
return any(p.match(line) for p in CODE_PATTERNS)
def _is_json_content(text: str) -> bool:
try:
json.loads(text)
return True
except (json.JSONDecodeError, ValueError):
return False
def _is_yaml_content(lines: list[str]) -> bool:
yaml_indicators = 0
for line in lines[:30]:
stripped = line.strip()
if stripped.startswith("---"):
yaml_indicators += 1
elif re.match(r"^\w[\w\s]*:\s", stripped):
yaml_indicators += 1
elif stripped.startswith("- ") and ":" in stripped:
yaml_indicators += 1
non_empty = sum(1 for l in lines[:30] if l.strip())
return non_empty > 0 and yaml_indicators / non_empty > 0.6
def detect_file_type(filepath: Path) -> str:
ext = filepath.suffix.lower()
if ext in COMPRESSIBLE_EXTENSIONS:
return "natural_language"
if ext in SKIP_EXTENSIONS:
return "code" if ext not in {".json", ".yaml", ".yml", ".toml", ".ini", ".cfg", ".env"} else "config"
if not ext:
try:
text = filepath.read_text(errors="ignore")
except (OSError, PermissionError):
return "unknown"
lines = text.splitlines()[:50]
if _is_json_content(text[:10000]):
return "config"
if _is_yaml_content(lines):
return "config"
code_lines = sum(1 for l in lines if l.strip() and _is_code_line(l))
non_empty = sum(1 for l in lines if l.strip())
if non_empty > 0 and code_lines / non_empty > 0.4:
return "code"
return "natural_language"
return "unknown"
def should_compress(filepath: Path) -> bool:
if not filepath.is_file():
return False
if filepath.name.endswith(".original.md"):
return False
return detect_file_type(filepath) == "natural_language"
if __name__ == "__main__":
import sys
if len(sys.argv) < 2:
print("Usage: python detect.py <file1> [file2] ...")
sys.exit(1)
for path_str in sys.argv[1:]:
p = Path(path_str).resolve()
file_type = detect_file_type(p)
compress = should_compress(p)
print(f" {p.name:30s} type={file_type:20s} compress={compress}")