Skip to content

Commit

Permalink
feat: support for semgrep rules, currently two implemented, with cust…
Browse files Browse the repository at this point in the history
…om options
  • Loading branch information
art1f1c3R committed Jan 30, 2025
1 parent 38cc36b commit c87c685
Show file tree
Hide file tree
Showing 5 changed files with 441 additions and 99 deletions.
6 changes: 3 additions & 3 deletions src/macaron/config/defaults.ini
Original file line number Diff line number Diff line change
Expand Up @@ -595,6 +595,6 @@ epoch_threshold = 3
# The number of days +/- the day of publish the calendar versioning day may be.
day_publish_error = 4

# yaml configuration file containing suspicious patterns. Can be full path or relative to
# folder where macaron is installed. This will be normalised to the OS path type.
suspicious_patterns_file = src/macaron/malware_analyzer/pypi_heuristics/sourcecode/suspicious_patterns.yaml
# Absolute path to a custom set of semgrep rules for source code analysis. These rules are used in addition to
# Macaron's default rules; leave empty to use the default rules only. The path will be normalised to the OS path type.
custom_semgrep_rules =
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,9 @@
import subprocess # nosec
import tempfile
from collections import defaultdict
from typing import Any

import yaml

from macaron.config.defaults import defaults
from macaron.config.global_config import global_config
from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError, SourceCodeError
from macaron.json_tools import JsonType, json_extract
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult
Expand Down Expand Up @@ -58,79 +56,61 @@ class PyPISourcecodeAnalyzer:

def __init__(self) -> None:
    """Collect required data for analysing the source code."""
    # NOTE(review): `_load_defaults` is invoked twice in this method (here and on the last line).
    # The two assignments look like they come from different revisions of the class -- confirm
    # which one is current before relying on `suspicious_patterns`.
    self.suspicious_patterns = self._load_defaults()
    # Rule files appended by `_create_rules` and released by `_clear_rules`.
    self.rule_files: list = []
    # Rule locations passed to semgrep via `--config` in `analyze_patterns`; the custom path is
    # None when no custom rules are configured.
    self.default_rule_path, self.custom_rule_path = self._load_defaults()

def _load_defaults(self) -> tuple[str, str | None]:
    """Load the default semgrep rules and, if present, the custom semgrep rules provided by the user.

    Semgrep validation is run on the custom rules provided by the user.

    Returns
    -------
    tuple[str, str | None]
        The default rule path and the custom rule path or None if one was not provided

    Raises
    ------
    ConfigurationError
        If the heuristic.pypi entry is not present, if the custom rule path does not exist, or if the
        semgrep validation of the custom rule path failed or could not be run.
    """
    default_rule_path = os.path.join(global_config.resources_path, "pypi_malware_rules")
    section_name = "heuristic.pypi"

    if not defaults.has_section(section_name):
        error_msg = f"Unable to find section {section_name}, which must be present."
        logger.debug(error_msg)
        raise ConfigurationError(error_msg)
    section = defaults[section_name]

    configuration_name = "custom_semgrep_rules"
    custom_rule_path = section.get(configuration_name)
    if not custom_rule_path:  # i.e. None or empty string
        logger.debug("No custom path listed under %s, using default rules only.", configuration_name)
        return default_rule_path, None

    custom_rule_path = os.path.normpath(custom_rule_path)
    if not os.path.exists(custom_rule_path):
        error_msg = f"Unable to locate path {custom_rule_path}"
        logger.debug(error_msg)
        raise ConfigurationError(error_msg)

    semgrep_commands: list[str] = ["semgrep", "scan", "--validate", "--config", custom_rule_path]
    try:
        # check=True turns any non-zero exit status into CalledProcessError, so no separate
        # return-code inspection is needed after this call.
        subprocess.run(semgrep_commands, check=True, capture_output=True)  # nosec
    except (OSError, subprocess.SubprocessError) as semgrep_error:
        # OSError covers a missing or non-executable semgrep binary; SubprocessError covers a
        # failed validation (CalledProcessError) as well as TimeoutExpired.
        error_msg = (
            f"Unable to run semgrep validation on {custom_rule_path} with arguments {semgrep_commands}: "
            f"{semgrep_error}"
        )
        logger.debug(error_msg)
        raise ConfigurationError(error_msg) from semgrep_error

    logger.debug("Including custom ruleset from %s.", custom_rule_path)
    return default_rule_path, custom_rule_path

def analyze_patterns(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
"""Analyze the source code of the package for malicious patterns.
Expand Down Expand Up @@ -162,9 +142,9 @@ def analyze_patterns(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[Heu
logger.debug(error_msg)
raise HeuristicAnalyzerValueError(error_msg)

self._create_rules()
for rule_file in self.rule_files:
semgrep_commands.extend(["--config", rule_file.name])
semgrep_commands.extend(["--config", self.default_rule_path])
if self.custom_rule_path:
semgrep_commands.extend(["--config", self.custom_rule_path])
semgrep_commands.append(source_code_path)

with tempfile.NamedTemporaryFile(mode="w+", delete=True) as output_json_file:
Expand Down Expand Up @@ -203,8 +183,6 @@ def analyze_patterns(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[Heu
end = json_extract(finding, ["end", "line"], int)
analysis_result[category].append({"file": file, "start": start, "end": end})

self._clear_rules()

return result, dict(analysis_result)

def analyze_dataflow(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
Expand Down Expand Up @@ -258,44 +236,6 @@ def analyze_dataflow(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[Heu

return result, analysis_result

def _create_rules(self) -> None:
    """Write the suspicious import patterns out as a temporary semgrep rule file.

    One rule is generated per category found under ``self.suspicious_patterns[IMPORTS]``. Each
    configured pattern is matched in three forms: ``import X``, ``from X import ...`` and
    ``__import__("X")``. The resulting file object is appended to ``self.rule_files``.
    """
    # Release rule files left over from a previous invocation before generating new ones.
    if self.rule_files:
        self._clear_rules()

    import_rules: list[dict[str, Any]] = []
    for category, patterns in self.suspicious_patterns[IMPORTS].items():
        alternatives: list = []
        for pattern in patterns:
            alternatives.extend(
                (
                    {"pattern": f"import {pattern}"},
                    {"pattern": f"from {pattern} import $X"},
                    {"pattern": f'__import__("{pattern}")'},
                )
            )

        import_rules.append(
            {
                "id": category,
                "severity": "ERROR",
                "languages": ["python"],
                "message": f"Detected suspicious imports from the '{category}' category",
                "pattern-either": alternatives,
            }
        )

    # delete=False keeps the file on disk after the with-block closes it.
    with tempfile.NamedTemporaryFile(
        "w", prefix=f"{IMPORTS}_", suffix=".yaml", delete=False
    ) as import_patterns_file:
        yaml.dump({"rules": import_rules}, import_patterns_file)
        self.rule_files.append(import_patterns_file)

def _clear_rules(self) -> None:
for file in self.rule_files:
file.close()
self.rule_files.clear()


class DataFlowTracer(ast.NodeVisitor):
"""The class is used to create the symbol table and analyze the dataflow."""
Expand Down
146 changes: 146 additions & 0 deletions src/macaron/resources/pypi_malware_rules/exfiltration.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

rules:
  # Taint rule: data obtained from any of the sources below (evaluated code, environment
  # variables, system/network/user information, credentials) that reaches one of the
  # network sinks is reported as exfiltration.
  - id: remote-exfiltration
    metadata:
      description: Detected the exfiltration of data to a remote endpoint
    message: Detected exfiltration of sensitive data to a remote endpoint.
    languages:
      - python
    severity: ERROR
    mode: taint
    options:
      symbolic_propagation: true
    pattern-sources:
      - pattern-either:
          # result of code/command evaluation
          - pattern: exec(...)
          - pattern: eval(...)
          - pattern: ast.literal_eval(...)
          - pattern: builtins.exec(...)
          - pattern: builtins.eval(...)
          - pattern: __import__('builtins').exec(...)
          - pattern: __import__('builtins').eval(...)

          # environment variables
          - pattern: os.environ
          - pattern: os.environ[...]
          - pattern: os.environ.get(...)
          - pattern: os.environb
          - pattern: os.environb[...]
          - pattern: os.environb.get(...)
          - pattern: os.getenv(...)
          - pattern: os.getenvb(...)

          # system information
          - pattern: os.uname(...)
          - pattern: os.confstr(...)
          - pattern: os.confstr_names
          - pattern: os.sysconf(...)
          - pattern: os.sysconf_names
          - pattern: platform.release(...)
          - pattern: platform.version(...)
          - pattern: platform.uname(...)
          - pattern: platform.win32_ver(...)
          - pattern: platform.win32_edition(...)
          - pattern: platform.win32_is_iot(...)
          - pattern: platform.mac_ver(...)
          - pattern: platform.ios_ver(...)
          - pattern: platform.libc_ver(...)
          - pattern: platform.freedesktop_os_release(...)
          - pattern: platform.android_ver(...)

          # network information
          - pattern: psutil.net_connections(...)
          - pattern: psutil.net_if_addrs(...)
          - pattern: psutil.net_if_stats(...)
          - pattern: platform.node(...)
          - pattern: platform.platform(...)
          - pattern: socket.gethostname(...)
          - pattern: socket.gethostbyname(...)
          - pattern: socket.gethostbyname_ex(...)
          - pattern: socket.getfqdn(...)
          - pattern: socket.if_nameindex(...)

          # user information
          - pattern: psutil.users(...)

          # sensitive information
          - pattern: getpass.getpass(...)
          - pattern: getpass.unix_getpass(...)
          - pattern: getpass.win_getpass(...)
          - pattern: getpass.getuser(...)
          - pattern: pwd.getpwuid(...)
          - pattern: pwd.getpwnam(...)
          - pattern: pwd.getpwall(...)
          - pattern: keyring.get_keyring(...)
          - pattern: keyring.get_password(...)
          - pattern: keyring.get_credential(...)
          - pattern: winreg.ConnectRegistry(...)
          - pattern: winreg.LoadKey(...)
          - pattern: winreg.OpenKey(...)
          - pattern: winreg.OpenKeyEx(...)
          - pattern: winreg.QueryInfoKey(...)
          - pattern: winreg.QueryValue(...)
          - pattern: winreg.QueryValueEx(...)

    pattern-sinks:
      - pattern-either:
          # remote connection
          # using socket module
          - pattern: socket.socket(...)
          - pattern: $SOC.accept(...)
          - pattern: $SOC.bind(...)
          - pattern: $SOC.connect(...)
          - pattern: $SOC.connect_ex(...)
          - pattern: $SOC.listen(...)
          - pattern: $SOC.recv(...)
          - pattern: $SOC.recvfrom(...)
          - pattern: $SOC.recvmsg(...)
          - pattern: $SOC.recvmsg_into(...)
          - pattern: $SOC.recvfrom_into(...)
          - pattern: $SOC.recv_into(...)
          - pattern: $SOC.send(...)
          - pattern: $SOC.sendall(...)
          - pattern: $SOC.sendto(...)
          - pattern: $SOC.sendmsg(...)
          - pattern: $SOC.sendmsg_afalg(...)
          - pattern: $SOC.sendfile(...)
          # using requests module
          - pattern: requests.request(...)
          - pattern: requests.get(...)
          - pattern: requests.post(...)
          - pattern: requests.put(...)
          - pattern: requests.patch(...)
          - pattern: requests.delete(...)
          - pattern: requests.head(...)
          - pattern: requests.options(...)
          - pattern: requests.Session(...)
          - pattern: requests.Request(...)
          # using urllib3 module
          - pattern: urllib3.PoolManager(...)
          - pattern: urllib3.request(...)
          - pattern: urllib3.HTTPConnectionPool(...)
          - pattern: urllib3.HTTPSConnectionPool(...)
          - pattern: urllib3.ConnectionPool(...)
          - pattern: urllib3.ProxyManager(...)
          - pattern: urllib3.contrib.socks.SOCKSProxyManager(...)
          # using urllib
          - pattern: urllib.request(...)
          - pattern: urllib.request.urlopen(...)
          - pattern: urllib.request.Request(...)
          # using urlrequest module
          - pattern: UrlRequest(...)
          - pattern: UrlRequestRequests(...)
          - pattern: UrlRequestUrllib(...)
          # using httpx
          - pattern: httpx.request(...)
          - pattern: httpx.get(...)
          - pattern: httpx.post(...)
          - pattern: httpx.put(...)
          - pattern: httpx.patch(...)
          - pattern: httpx.delete(...)
          - pattern: httpx.head(...)
          - pattern: httpx.options(...)
          - pattern: httpx.stream(...)
          - pattern: httpx.AsyncClient(...)
          - pattern: httpx.AsyncHTTPTransport(...)
          - pattern: httpx.Client(...)
          - pattern: httpx.Request(...)
Loading

0 comments on commit c87c685

Please sign in to comment.