Pushed by: admin License: RB02Z2FI66TO (Trial) Timestamp: 2026-03-26T23:49:16.182698masterdevtest
parent
0645a74478
commit
5719653c80
@ -1,2 +0,0 @@
|
|||||||
2.1.0
|
|
||||||
2.1.0
|
|
||||||
@ -1,54 +0,0 @@
|
|||||||
{
|
|
||||||
"dependencies": null,
|
|
||||||
"incompatibleApps": null,
|
|
||||||
"info": {
|
|
||||||
"author": [
|
|
||||||
{
|
|
||||||
"company": "Bitwarden Inc.",
|
|
||||||
"email": "support@bitwarden.com",
|
|
||||||
"name": "Bitwarden Inc."
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"classification": {
|
|
||||||
"categories": [],
|
|
||||||
"developmentStatus": null,
|
|
||||||
"intendedAudience": null
|
|
||||||
},
|
|
||||||
"commonInformationModels": null,
|
|
||||||
"description": "Bitwarden event logs integration into Splunk.",
|
|
||||||
"id": {
|
|
||||||
"group": null,
|
|
||||||
"name": "bitwarden_event_logs",
|
|
||||||
"version": "2.1.0"
|
|
||||||
},
|
|
||||||
"license": {
|
|
||||||
"name": null,
|
|
||||||
"text": null,
|
|
||||||
"uri": null
|
|
||||||
},
|
|
||||||
"privacyPolicy": {
|
|
||||||
"name": null,
|
|
||||||
"text": null,
|
|
||||||
"uri": null
|
|
||||||
},
|
|
||||||
"releaseDate": null,
|
|
||||||
"releaseNotes": {
|
|
||||||
"name": "README",
|
|
||||||
"text": "README.txt",
|
|
||||||
"uri": ""
|
|
||||||
},
|
|
||||||
"title": "Bitwarden Event Logs"
|
|
||||||
},
|
|
||||||
"inputGroups": null,
|
|
||||||
"platformRequirements": null,
|
|
||||||
"schemaVersion": "2.0.0",
|
|
||||||
"supportedDeployments": [
|
|
||||||
"_standalone",
|
|
||||||
"_distributed"
|
|
||||||
],
|
|
||||||
"targetWorkloads": [
|
|
||||||
"_search_heads",
|
|
||||||
"_indexers"
|
|
||||||
],
|
|
||||||
"tasks": null
|
|
||||||
}
|
|
||||||
@ -1,6 +0,0 @@
|
|||||||
[eventsapi]
|
|
||||||
enforceTypes = true
|
|
||||||
field.next_request.start = string
|
|
||||||
field.next_request.end = string
|
|
||||||
field.next_request.continuation_token = string
|
|
||||||
field.last_log_date = string
|
|
||||||
@ -1,147 +0,0 @@
|
|||||||
<form version="1.1">
|
|
||||||
<label>Bitwarden Authentication Events</label>
|
|
||||||
<fieldset submitButton="false" autoRun="false">
|
|
||||||
<input type="time" token="timeframe">
|
|
||||||
<label>Timeframe</label>
|
|
||||||
<default>
|
|
||||||
<earliest>-14h@h</earliest>
|
|
||||||
<latest>now</latest>
|
|
||||||
</default>
|
|
||||||
</input>
|
|
||||||
<input type="dropdown" token="top_users_by">
|
|
||||||
<label>Top Users By</label>
|
|
||||||
<default>actingUserEmail</default>
|
|
||||||
<choice value="actingUserId">User ID</choice>
|
|
||||||
<choice value="actingUserEmail">User Email</choice>
|
|
||||||
<choice value="actingUserName">User Name</choice>
|
|
||||||
</input>
|
|
||||||
</fieldset>
|
|
||||||
<row>
|
|
||||||
<panel>
|
|
||||||
<map>
|
|
||||||
<title>Successful Log In Attempts</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" type=1000 | iplocation ipAddress | lookup geo_countries longitude as lon, latitude as lat | stats count by Country | geom geo_countries featureIdField=Country</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="drilldown">none</option>
|
|
||||||
<option name="mapping.choroplethLayer.colorBins">5</option>
|
|
||||||
<option name="mapping.choroplethLayer.colorMode">auto</option>
|
|
||||||
<option name="mapping.choroplethLayer.maximumColor">0x53a051</option>
|
|
||||||
<option name="mapping.map.center">(55.97,-40.69)</option>
|
|
||||||
<option name="mapping.map.zoom">3</option>
|
|
||||||
<option name="mapping.type">choropleth</option>
|
|
||||||
</map>
|
|
||||||
</panel>
|
|
||||||
</row>
|
|
||||||
<row>
|
|
||||||
<panel>
|
|
||||||
<chart>
|
|
||||||
<title>Authentication Events by Device</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" (type=1000 OR type=1001 OR type=1002 OR type=1003 OR type=1004 OR type=1005 OR type=1006 OR type=1008 OR type=1009) | timechart count by deviceName</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="charting.chart">line</option>
|
|
||||||
<option name="charting.drilldown">none</option>
|
|
||||||
<option name="refresh.display">progressbar</option>
|
|
||||||
</chart>
|
|
||||||
</panel>
|
|
||||||
<panel>
|
|
||||||
<chart>
|
|
||||||
<title>Authentication Events by Type</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" (type=1000 OR type=1001 OR type=1002 OR type=1003 OR type=1004 OR type=1005 OR type=1006 OR type=1008 OR type=1009) | timechart count by typeName</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="charting.chart">line</option>
|
|
||||||
<option name="charting.drilldown">none</option>
|
|
||||||
<option name="refresh.display">progressbar</option>
|
|
||||||
</chart>
|
|
||||||
</panel>
|
|
||||||
</row>
|
|
||||||
<row>
|
|
||||||
<panel>
|
|
||||||
<title>Authentication Events by Device</title>
|
|
||||||
<chart>
|
|
||||||
<title>Device</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" (type=1000 OR type=1001 OR type=1002 OR type=1003 OR type=1004 OR type=1005 OR type=1006 OR type=1008 OR type=1009) | stats count by deviceName</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="charting.chart">pie</option>
|
|
||||||
<option name="charting.drilldown">none</option>
|
|
||||||
<option name="refresh.display">progressbar</option>
|
|
||||||
</chart>
|
|
||||||
</panel>
|
|
||||||
<panel>
|
|
||||||
<title>Authentication Events by Type</title>
|
|
||||||
<chart>
|
|
||||||
<title>Type</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" (type=1000 OR type=1001 OR type=1002 OR type=1003 OR type=1004 OR type=1005 OR type=1006 OR type=1008 OR type=1009) | stats count by typeName</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="charting.chart">pie</option>
|
|
||||||
<option name="charting.drilldown">none</option>
|
|
||||||
<option name="refresh.display">progressbar</option>
|
|
||||||
</chart>
|
|
||||||
</panel>
|
|
||||||
</row>
|
|
||||||
<row>
|
|
||||||
<panel>
|
|
||||||
<chart>
|
|
||||||
<title>Top Failed Log In Attempts</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" (type=1005 OR type=1006) | stats count by $top_users_by$ | sort - count</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="charting.axisLabelsX.majorLabelStyle.rotation">45</option>
|
|
||||||
<option name="charting.axisTitleX.text">Acting User</option>
|
|
||||||
<option name="charting.chart">column</option>
|
|
||||||
<option name="charting.chart.showDataLabels">all</option>
|
|
||||||
<option name="charting.chart.stackMode">default</option>
|
|
||||||
<option name="charting.drilldown">none</option>
|
|
||||||
<option name="refresh.display">progressbar</option>
|
|
||||||
</chart>
|
|
||||||
</panel>
|
|
||||||
</row>
|
|
||||||
<row>
|
|
||||||
<panel>
|
|
||||||
<chart>
|
|
||||||
<title>Top Successful Log In Attempts</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" type=1000 | stats count by $top_users_by$ | sort - count</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="charting.axisLabelsX.majorLabelStyle.rotation">45</option>
|
|
||||||
<option name="charting.axisTitleX.text">Acting User</option>
|
|
||||||
<option name="charting.chart">column</option>
|
|
||||||
<option name="charting.chart.stackMode">default</option>
|
|
||||||
<option name="charting.drilldown">none</option>
|
|
||||||
<option name="refresh.display">progressbar</option>
|
|
||||||
</chart>
|
|
||||||
</panel>
|
|
||||||
</row>
|
|
||||||
<row>
|
|
||||||
<panel>
|
|
||||||
<event>
|
|
||||||
<title>Latest Authentication Events</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" (type=1000 OR type=1001 OR type=1002 OR type=1003 OR type=1004 OR type=1005 OR type=1006 OR type=1008 OR type=1009)
|
|
||||||
</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="list.drilldown">none</option>
|
|
||||||
</event>
|
|
||||||
</panel>
|
|
||||||
</row>
|
|
||||||
</form>
|
|
||||||
@ -1,126 +0,0 @@
|
|||||||
<form version="1.1">
|
|
||||||
<label>Bitwarden Organization Events</label>
|
|
||||||
<fieldset submitButton="false" autoRun="false">
|
|
||||||
<input type="time" token="timeframe">
|
|
||||||
<label>Timeframe</label>
|
|
||||||
<default>
|
|
||||||
<earliest>-24h@h</earliest>
|
|
||||||
<latest>now</latest>
|
|
||||||
</default>
|
|
||||||
</input>
|
|
||||||
<input type="dropdown" token="top_users_by">
|
|
||||||
<label>Top Users By</label>
|
|
||||||
<default>actingUserEmail</default>
|
|
||||||
<choice value="actingUserId">User ID</choice>
|
|
||||||
<choice value="actingUserEmail">User Email</choice>
|
|
||||||
<choice value="actingUserName">User Name</choice>
|
|
||||||
</input>
|
|
||||||
</fieldset>
|
|
||||||
<row>
|
|
||||||
<panel>
|
|
||||||
<map>
|
|
||||||
<title>Organization Events</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" <![CDATA[(type>=1300 AND type<1800)]]> | iplocation ipAddress | lookup geo_countries longitude as lon, latitude as lat | stats count by Country | geom geo_countries featureIdField=Country</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="drilldown">none</option>
|
|
||||||
<option name="mapping.choroplethLayer.colorBins">5</option>
|
|
||||||
<option name="mapping.choroplethLayer.colorMode">auto</option>
|
|
||||||
<option name="mapping.choroplethLayer.maximumColor">0x53a051</option>
|
|
||||||
<option name="mapping.map.center">(55.97,-40.69)</option>
|
|
||||||
<option name="mapping.map.zoom">3</option>
|
|
||||||
<option name="mapping.type">choropleth</option>
|
|
||||||
<option name="refresh.display">progressbar</option>
|
|
||||||
</map>
|
|
||||||
</panel>
|
|
||||||
</row>
|
|
||||||
<row>
|
|
||||||
<panel>
|
|
||||||
<chart>
|
|
||||||
<title>Organization Events by Device</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" <![CDATA[(type>=1300 AND type<1800)]]> | timechart count by deviceName</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="charting.chart">line</option>
|
|
||||||
<option name="charting.drilldown">none</option>
|
|
||||||
<option name="refresh.display">progressbar</option>
|
|
||||||
</chart>
|
|
||||||
</panel>
|
|
||||||
<panel>
|
|
||||||
<chart>
|
|
||||||
<title>Organization Events by Type</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" <![CDATA[(type>=1300 AND type<1800)]]> | timechart count by typeName</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest></search>
|
|
||||||
<option name="charting.chart">line</option>
|
|
||||||
<option name="charting.drilldown">none</option>
|
|
||||||
<option name="refresh.display">progressbar</option>
|
|
||||||
</chart>
|
|
||||||
</panel>
|
|
||||||
</row>
|
|
||||||
<row>
|
|
||||||
<panel>
|
|
||||||
<title>Organization Events by Device</title>
|
|
||||||
<chart>
|
|
||||||
<title>Device</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" <![CDATA[(type>=1300 AND type<1800)]]> | stats count by deviceName</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="charting.chart">pie</option>
|
|
||||||
<option name="charting.drilldown">none</option>
|
|
||||||
<option name="refresh.display">progressbar</option>
|
|
||||||
</chart>
|
|
||||||
</panel>
|
|
||||||
<panel>
|
|
||||||
<title>Organization Events by Type</title>
|
|
||||||
<chart>
|
|
||||||
<title>Type</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" <![CDATA[(type>=1300 AND type<1800)]]> | stats count by typeName</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="charting.chart">pie</option>
|
|
||||||
<option name="charting.drilldown">none</option>
|
|
||||||
<option name="refresh.display">progressbar</option>
|
|
||||||
</chart>
|
|
||||||
</panel>
|
|
||||||
</row>
|
|
||||||
<row>
|
|
||||||
<panel>
|
|
||||||
<chart>
|
|
||||||
<title>Top Organization Event Users</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" <![CDATA[(type>=1300 AND type<1800)]]> | stats count by $top_users_by$ | sort - count</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="charting.axisLabelsX.majorLabelStyle.rotation">90</option>
|
|
||||||
<option name="charting.axisTitleX.text">Acting User</option>
|
|
||||||
<option name="charting.chart">column</option>
|
|
||||||
<option name="charting.drilldown">none</option>
|
|
||||||
<option name="refresh.display">progressbar</option>
|
|
||||||
</chart>
|
|
||||||
</panel>
|
|
||||||
</row>
|
|
||||||
<row>
|
|
||||||
<panel>
|
|
||||||
<event>
|
|
||||||
<title>Latest Organization Events</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" <![CDATA[(type>=1300 AND type<1800)]]></query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="list.drilldown">none</option>
|
|
||||||
</event>
|
|
||||||
</panel>
|
|
||||||
</row>
|
|
||||||
</form>
|
|
||||||
@ -1,13 +0,0 @@
|
|||||||
<dashboard isDashboard="false"
|
|
||||||
script="setup/runtime.js,setup/polyfills.js,setup/scripts.js,setup/main.js"
|
|
||||||
stylesheet="setup/styles.css"
|
|
||||||
version="1.1">
|
|
||||||
<label>Setup</label>
|
|
||||||
<row>
|
|
||||||
<panel>
|
|
||||||
<html>
|
|
||||||
<div id="app-root"></div>
|
|
||||||
</html>
|
|
||||||
</panel>
|
|
||||||
</row>
|
|
||||||
</dashboard>
|
|
||||||
@ -1,127 +0,0 @@
|
|||||||
<form version="1.1">
|
|
||||||
<label>Bitwarden Vault Item Events</label>
|
|
||||||
<fieldset submitButton="false" autoRun="false">
|
|
||||||
<input type="time" token="timeframe">
|
|
||||||
<label>Timeframe</label>
|
|
||||||
<default>
|
|
||||||
<earliest>-24h@h</earliest>
|
|
||||||
<latest>now</latest>
|
|
||||||
</default>
|
|
||||||
</input>
|
|
||||||
<input type="dropdown" token="top_users_by">
|
|
||||||
<label>Top Users By</label>
|
|
||||||
<default>actingUserEmail</default>
|
|
||||||
<choice value="actingUserId">User ID</choice>
|
|
||||||
<choice value="actingUserEmail">User Email</choice>
|
|
||||||
<choice value="actingUserName">User Name</choice>
|
|
||||||
</input>
|
|
||||||
</fieldset>
|
|
||||||
<row>
|
|
||||||
<panel>
|
|
||||||
<map>
|
|
||||||
<title>Vault Item Events</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" <![CDATA[(type>=1100 AND type<1200)]]> | iplocation ipAddress | lookup geo_countries longitude as lon, latitude as lat | stats count by Country | geom geo_countries featureIdField=Country</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="drilldown">none</option>
|
|
||||||
<option name="mapping.choroplethLayer.colorBins">5</option>
|
|
||||||
<option name="mapping.choroplethLayer.colorMode">auto</option>
|
|
||||||
<option name="mapping.choroplethLayer.maximumColor">0x53a051</option>
|
|
||||||
<option name="mapping.map.center">(55.97,-40.69)</option>
|
|
||||||
<option name="mapping.map.zoom">3</option>
|
|
||||||
<option name="mapping.type">choropleth</option>
|
|
||||||
<option name="refresh.display">progressbar</option>
|
|
||||||
</map>
|
|
||||||
</panel>
|
|
||||||
</row>
|
|
||||||
<row>
|
|
||||||
<panel>
|
|
||||||
<chart>
|
|
||||||
<title>Vault Item Events by Device</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" <![CDATA[(type>=1100 AND type<1200)]]> | timechart count by deviceName</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="charting.chart">line</option>
|
|
||||||
<option name="charting.drilldown">none</option>
|
|
||||||
<option name="refresh.display">progressbar</option>
|
|
||||||
</chart>
|
|
||||||
</panel>
|
|
||||||
<panel>
|
|
||||||
<chart>
|
|
||||||
<title>Vault Item Events by Type</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" <![CDATA[(type>=1100 AND type<1200)]]> | timechart count by typeName</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest></search>
|
|
||||||
<option name="charting.chart">line</option>
|
|
||||||
<option name="charting.drilldown">none</option>
|
|
||||||
<option name="refresh.display">progressbar</option>
|
|
||||||
</chart>
|
|
||||||
</panel>
|
|
||||||
</row>
|
|
||||||
<row>
|
|
||||||
<panel>
|
|
||||||
<title>Vault Item Events by Device</title>
|
|
||||||
<chart>
|
|
||||||
<title>Device</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" <![CDATA[(type>=1100 AND type<1200)]]> | stats count by deviceName</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="charting.chart">pie</option>
|
|
||||||
<option name="charting.drilldown">none</option>
|
|
||||||
<option name="refresh.display">progressbar</option>
|
|
||||||
</chart>
|
|
||||||
</panel>
|
|
||||||
<panel>
|
|
||||||
<title>Vault Item Events by Type</title>
|
|
||||||
<chart>
|
|
||||||
<title>Type</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" <![CDATA[(type>=1100 AND type<1200)]]> | stats count by typeName</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="charting.chart">pie</option>
|
|
||||||
<option name="charting.drilldown">none</option>
|
|
||||||
<option name="refresh.display">progressbar</option>
|
|
||||||
</chart>
|
|
||||||
</panel>
|
|
||||||
</row>
|
|
||||||
<row>
|
|
||||||
<panel>
|
|
||||||
<chart>
|
|
||||||
<title>Top Vault Item Event Users</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" <![CDATA[(type>=1100 AND type<1200)]]> | stats count by $top_users_by$ | sort - count</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="charting.axisLabelsX.majorLabelStyle.rotation">90</option>
|
|
||||||
<option name="charting.axisTitleX.text">Acting User</option>
|
|
||||||
<option name="charting.chart">column</option>
|
|
||||||
<option name="charting.drilldown">none</option>
|
|
||||||
<option name="refresh.display">progressbar</option>
|
|
||||||
</chart>
|
|
||||||
</panel>
|
|
||||||
</row>
|
|
||||||
<row>
|
|
||||||
<panel>
|
|
||||||
<event>
|
|
||||||
<title>Latest Vault Item Events</title>
|
|
||||||
<search>
|
|
||||||
<query>`bitwarden_event_logs_index` sourcetype="bitwarden:events" <![CDATA[(type>=1100 AND type<1200)]]>
|
|
||||||
</query>
|
|
||||||
<earliest>$timeframe.earliest$</earliest>
|
|
||||||
<latest>$timeframe.latest$</latest>
|
|
||||||
</search>
|
|
||||||
<option name="list.drilldown">none</option>
|
|
||||||
</event>
|
|
||||||
</panel>
|
|
||||||
</row>
|
|
||||||
</form>
|
|
||||||
@ -1,6 +0,0 @@
|
|||||||
[script://$SPLUNK_HOME/etc/apps/bitwarden_event_logs/bin/bitwarden_event_logs.py]
|
|
||||||
interval = 60
|
|
||||||
sourcetype = bitwarden:events
|
|
||||||
index = main
|
|
||||||
passAuth = splunk-system-user
|
|
||||||
python.version = python3
|
|
||||||
@ -1,2 +0,0 @@
|
|||||||
[bitwarden_event_logs_index]
|
|
||||||
definition=index=*
|
|
||||||
@ -1,138 +0,0 @@
|
|||||||
[bitwarden:events]
|
|
||||||
LINE_BREAKER = ([\r\n])
|
|
||||||
SHOULD_LINEMERGE = false
|
|
||||||
TRUNCATE = 5000
|
|
||||||
KV_MODE = json
|
|
||||||
FIELDALIAS-alias_1 = ipAddress AS src
|
|
||||||
FIELDALIAS-alias_2 = date AS timestamp
|
|
||||||
EVAL-typeName = coalesce(case(\
|
|
||||||
type==1000,"User_LoggedIn",\
|
|
||||||
type==1001,"User_ChangedPassword",\
|
|
||||||
type==1002,"User_Updated2fa",\
|
|
||||||
type==1003,"User_Disabled2fa",\
|
|
||||||
type==1004,"User_Recovered2fa",\
|
|
||||||
type==1005,"User_FailedLogIn",\
|
|
||||||
type==1006,"User_FailedLogIn2fa",\
|
|
||||||
type==1007,"User_ClientExportedVault",\
|
|
||||||
type==1008,"User_UpdatedTempPassword",\
|
|
||||||
type==1009,"User_MigratedKeyToKeyConnector",\
|
|
||||||
type==1010,"User_RequestedDeviceApproval",\
|
|
||||||
type==1011,"User_TdeOffboardingPasswordSet",\
|
|
||||||
type==1100,"Cipher_Created",\
|
|
||||||
type==1101,"Cipher_Updated",\
|
|
||||||
type==1102,"Cipher_Deleted",\
|
|
||||||
type==1103,"Cipher_AttachmentCreated",\
|
|
||||||
type==1104,"Cipher_AttachmentDeleted",\
|
|
||||||
type==1105,"Cipher_Shared",\
|
|
||||||
type==1106,"Cipher_UpdatedCollections",\
|
|
||||||
type==1107,"Cipher_ClientViewed",\
|
|
||||||
type==1108,"Cipher_ClientToggledPasswordVisible",\
|
|
||||||
type==1109,"Cipher_ClientToggledHiddenFieldVisible",\
|
|
||||||
type==1110,"Cipher_ClientToggledCardCodeVisible",\
|
|
||||||
type==1111,"Cipher_ClientCopiedPassword",\
|
|
||||||
type==1112,"Cipher_ClientCopiedHiddenField",\
|
|
||||||
type==1113,"Cipher_ClientCopiedCardCode",\
|
|
||||||
type==1114,"Cipher_ClientAutofilled",\
|
|
||||||
type==1115,"Cipher_SoftDeleted",\
|
|
||||||
type==1116,"Cipher_Restored",\
|
|
||||||
type==1117,"Cipher_ClientToggledCardNumberVisible",\
|
|
||||||
type==1300,"Collection_Created",\
|
|
||||||
type==1301,"Collection_Updated",\
|
|
||||||
type==1302,"Collection_Deleted",\
|
|
||||||
type==1400,"Group_Created",\
|
|
||||||
type==1401,"Group_Updated",\
|
|
||||||
type==1402,"Group_Deleted",\
|
|
||||||
type==1500,"OrganizationUser_Invited",\
|
|
||||||
type==1501,"OrganizationUser_Confirmed",\
|
|
||||||
type==1502,"OrganizationUser_Updated",\
|
|
||||||
type==1503,"OrganizationUser_Removed",\
|
|
||||||
type==1504,"OrganizationUser_UpdatedGroups",\
|
|
||||||
type==1505,"OrganizationUser_UnlinkedSso",\
|
|
||||||
type==1506,"OrganizationUser_ResetPassword_Enroll",\
|
|
||||||
type==1507,"OrganizationUser_ResetPassword_Withdraw",\
|
|
||||||
type==1508,"OrganizationUser_AdminResetPassword",\
|
|
||||||
type==1509,"OrganizationUser_ResetSsoLink",\
|
|
||||||
type==1510,"OrganizationUser_FirstSsoLogin",\
|
|
||||||
type==1511,"OrganizationUser_Revoked",\
|
|
||||||
type==1512,"OrganizationUser_Restored",\
|
|
||||||
type==1513,"OrganizationUser_ApprovedAuthRequest",\
|
|
||||||
type==1514,"OrganizationUser_RejectedAuthRequest",\
|
|
||||||
type==1515,"OrganizationUser_Deleted",\
|
|
||||||
type==1516,"OrganizationUser_Left",\
|
|
||||||
type==1517,"OrganizationUser_AutomaticallyConfirmed",\
|
|
||||||
type==1600,"Organization_Updated",\
|
|
||||||
type==1601,"Organization_PurgedVault",\
|
|
||||||
type==1602,"Organization_ClientExportedVault",\
|
|
||||||
type==1603,"Organization_VaultAccessed",\
|
|
||||||
type==1604,"Organization_EnabledSso",\
|
|
||||||
type==1605,"Organization_DisabledSso",\
|
|
||||||
type==1606,"Organization_EnabledKeyConnector",\
|
|
||||||
type==1607,"Organization_DisabledKeyConnector",\
|
|
||||||
type==1608,"Organization_SponsorshipsSynced",\
|
|
||||||
type==1609,"Organization_CollectionManagement_Updated",\
|
|
||||||
type==1610,"Organization_CollectionManagement_LimitCollectionCreationEnabled",\
|
|
||||||
type==1611,"Organization_CollectionManagement_LimitCollectionCreationDisabled",\
|
|
||||||
type==1612,"Organization_CollectionManagement_LimitCollectionDeletionEnabled",\
|
|
||||||
type==1613,"Organization_CollectionManagement_LimitCollectionDeletionDisabled",\
|
|
||||||
type==1614,"Organization_CollectionManagement_LimitItemDeletionEnabled",\
|
|
||||||
type==1615,"Organization_CollectionManagement_LimitItemDeletionDisabled",\
|
|
||||||
type==1616,"Organization_CollectionManagement_AllowAdminAccessToAllCollectionItemsEnabled",\
|
|
||||||
type==1617,"Organization_CollectionManagement_AllowAdminAccessToAllCollectionItemsDisabled",\
|
|
||||||
type==1620,"Organization_AutoConfirmEnabled_Admin",\
|
|
||||||
type==1621,"Organization_AutoConfirmDisabled_Admin",\
|
|
||||||
type==1622,"Organization_AutoConfirmEnabled_Portal",\
|
|
||||||
type==1623,"Organization_AutoConfirmDisabled_Portal",\
|
|
||||||
type==1700,"Policy_Updated",\
|
|
||||||
type==1800,"ProviderUser_Invited",\
|
|
||||||
type==1801,"ProviderUser_Confirmed",\
|
|
||||||
type==1802,"ProviderUser_Updated",\
|
|
||||||
type==1803,"ProviderUser_Removed",\
|
|
||||||
type==1900,"ProviderOrganization_Created",\
|
|
||||||
type==1901,"ProviderOrganization_Added",\
|
|
||||||
type==1902,"ProviderOrganization_Removed",\
|
|
||||||
type==1903,"ProviderOrganization_VaultAccessed",\
|
|
||||||
type==2000,"OrganizationDomain_Added",\
|
|
||||||
type==2001,"OrganizationDomain_Removed",\
|
|
||||||
type==2002,"OrganizationDomain_Verified",\
|
|
||||||
type==2003,"OrganizationDomain_NotVerified",\
|
|
||||||
type==2100,"Secret_Retrieved",\
|
|
||||||
type==2101,"Secret_Created",\
|
|
||||||
type==2102,"Secret_Edited",\
|
|
||||||
type==2103,"Secret_Deleted",\
|
|
||||||
type==2104,"Secret_Permanently_Deleted",\
|
|
||||||
type==2105,"Secret_Restored",\
|
|
||||||
type==2200,"Project_Retrieved",\
|
|
||||||
type==2201,"Project_Created",\
|
|
||||||
type==2202,"Project_Edited",\
|
|
||||||
type==2203,"Project_Deleted"\
|
|
||||||
), type)
|
|
||||||
EVAL-deviceName = coalesce(case(device==0,"Android",\
|
|
||||||
device==1,"iOS",\
|
|
||||||
device==2,"Chrome Extension",\
|
|
||||||
device==3,"Firefox Extension",\
|
|
||||||
device==4,"Opera Extension",\
|
|
||||||
device==5,"Edge Extension",\
|
|
||||||
device==6,"Windows Desktop",\
|
|
||||||
device==7,"macOS Desktop",\
|
|
||||||
device==8,"Linux Desktop",\
|
|
||||||
device==9,"Chrome Browser",\
|
|
||||||
device==10,"Firefox Browser",\
|
|
||||||
device==11,"Opera Browser",\
|
|
||||||
device==12,"Edge Browser",\
|
|
||||||
device==13,"IEBrowser",\
|
|
||||||
device==14,"Unknown Browser",\
|
|
||||||
device==15,"Android Amazon",\
|
|
||||||
device==16,"UWP",\
|
|
||||||
device==17,"Safari Browser",\
|
|
||||||
device==18,"Vivaldi Browser",\
|
|
||||||
device==19,"Vivaldi Extension",\
|
|
||||||
device==20,"Safari Extension",\
|
|
||||||
device==21,"SDK",\
|
|
||||||
device==22,"Server",\
|
|
||||||
device==23,"Windows CLI",\
|
|
||||||
device==24,"MacOs CLI",\
|
|
||||||
device==25,"Linux CLI",\
|
|
||||||
device==26,"DuckDuckGo"\
|
|
||||||
), device)
|
|
||||||
TIME_PREFIX = "date":"
|
|
||||||
TIME_FORMAT = %Y-%m-%dT%H:%M:%S.%6N%Z
|
|
||||||
@ -1,4 +0,0 @@
|
|||||||
[config]
|
|
||||||
apiUrl=https://api.bitwarden.com
|
|
||||||
identityUrl=https://identity.bitwarden.com
|
|
||||||
loggingLevel=INFO
|
|
||||||
@ -1,3 +0,0 @@
|
|||||||
[shclustering]
|
|
||||||
conf_replication_include.script = true
|
|
||||||
|
|
||||||
@ -1 +0,0 @@
|
|||||||
pip
|
|
||||||
@ -1,22 +0,0 @@
|
|||||||
Copyright 2006 Dan-Haim. All rights reserved.
|
|
||||||
|
|
||||||
Redistribution and use in source and binary forms, with or without modification,
|
|
||||||
are permitted provided that the following conditions are met:
|
|
||||||
1. Redistributions of source code must retain the above copyright notice, this
|
|
||||||
list of conditions and the following disclaimer.
|
|
||||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
||||||
this list of conditions and the following disclaimer in the documentation
|
|
||||||
and/or other materials provided with the distribution.
|
|
||||||
3. Neither the name of Dan Haim nor the names of his contributors may be used
|
|
||||||
to endorse or promote products derived from this software without specific
|
|
||||||
prior written permission.
|
|
||||||
|
|
||||||
THIS SOFTWARE IS PROVIDED BY DAN HAIM "AS IS" AND ANY EXPRESS OR IMPLIED
|
|
||||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
|
||||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
|
||||||
EVENT SHALL DAN HAIM OR HIS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
||||||
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
||||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA
|
|
||||||
OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
||||||
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
|
||||||
OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
@ -1,321 +0,0 @@
|
|||||||
Metadata-Version: 2.1
|
|
||||||
Name: PySocks
|
|
||||||
Version: 1.7.1
|
|
||||||
Summary: A Python SOCKS client module. See https://github.com/Anorov/PySocks for more information.
|
|
||||||
Home-page: https://github.com/Anorov/PySocks
|
|
||||||
Author: Anorov
|
|
||||||
Author-email: anorov.vorona@gmail.com
|
|
||||||
License: BSD
|
|
||||||
Keywords: socks,proxy
|
|
||||||
Platform: UNKNOWN
|
|
||||||
Classifier: Programming Language :: Python :: 2
|
|
||||||
Classifier: Programming Language :: Python :: 2.7
|
|
||||||
Classifier: Programming Language :: Python :: 3
|
|
||||||
Classifier: Programming Language :: Python :: 3.4
|
|
||||||
Classifier: Programming Language :: Python :: 3.5
|
|
||||||
Classifier: Programming Language :: Python :: 3.6
|
|
||||||
Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*
|
|
||||||
Description-Content-Type: text/markdown
|
|
||||||
|
|
||||||
PySocks
|
|
||||||
=======
|
|
||||||
|
|
||||||
PySocks lets you send traffic through SOCKS and HTTP proxy servers. It is a modern fork of [SocksiPy](http://socksipy.sourceforge.net/) with bug fixes and extra features.
|
|
||||||
|
|
||||||
Acts as a drop-in replacement to the socket module. Seamlessly configure SOCKS proxies for any socket object by calling `socket_object.set_proxy()`.
|
|
||||||
|
|
||||||
----------------
|
|
||||||
|
|
||||||
Features
|
|
||||||
========
|
|
||||||
|
|
||||||
* SOCKS proxy client for Python 2.7 and 3.4+
|
|
||||||
* TCP supported
|
|
||||||
* UDP mostly supported (issues may occur in some edge cases)
|
|
||||||
* HTTP proxy client included but not supported or recommended (you should use urllib2's or requests' own HTTP proxy interface)
|
|
||||||
* urllib2 handler included. `pip install` / `setup.py install` will automatically install the `sockshandler` module.
|
|
||||||
|
|
||||||
Installation
|
|
||||||
============
|
|
||||||
|
|
||||||
pip install PySocks
|
|
||||||
|
|
||||||
Or download the tarball / `git clone` and...
|
|
||||||
|
|
||||||
python setup.py install
|
|
||||||
|
|
||||||
These will install both the `socks` and `sockshandler` modules.
|
|
||||||
|
|
||||||
Alternatively, include just `socks.py` in your project.
|
|
||||||
|
|
||||||
--------------------------------------------
|
|
||||||
|
|
||||||
*Warning:* PySocks/SocksiPy only supports HTTP proxies that use CONNECT tunneling. Certain HTTP proxies may not work with this library. If you wish to use HTTP (not SOCKS) proxies, it is recommended that you rely on your HTTP client's native proxy support (`proxies` dict for `requests`, or `urllib2.ProxyHandler` for `urllib2`) instead.
|
|
||||||
|
|
||||||
--------------------------------------------
|
|
||||||
|
|
||||||
Usage
|
|
||||||
=====
|
|
||||||
|
|
||||||
## socks.socksocket ##
|
|
||||||
|
|
||||||
import socks
|
|
||||||
|
|
||||||
s = socks.socksocket() # Same API as socket.socket in the standard lib
|
|
||||||
|
|
||||||
s.set_proxy(socks.SOCKS5, "localhost") # SOCKS4 and SOCKS5 use port 1080 by default
|
|
||||||
# Or
|
|
||||||
s.set_proxy(socks.SOCKS4, "localhost", 4444)
|
|
||||||
# Or
|
|
||||||
s.set_proxy(socks.HTTP, "5.5.5.5", 8888)
|
|
||||||
|
|
||||||
# Can be treated identical to a regular socket object
|
|
||||||
s.connect(("www.somesite.com", 80))
|
|
||||||
s.sendall("GET / HTTP/1.1 ...")
|
|
||||||
print s.recv(4096)
|
|
||||||
|
|
||||||
## Monkeypatching ##
|
|
||||||
|
|
||||||
To monkeypatch the entire standard library with a single default proxy:
|
|
||||||
|
|
||||||
import urllib2
|
|
||||||
import socket
|
|
||||||
import socks
|
|
||||||
|
|
||||||
socks.set_default_proxy(socks.SOCKS5, "localhost")
|
|
||||||
socket.socket = socks.socksocket
|
|
||||||
|
|
||||||
urllib2.urlopen("http://www.somesite.com/") # All requests will pass through the SOCKS proxy
|
|
||||||
|
|
||||||
Note that monkeypatching may not work for all standard modules or for all third party modules, and generally isn't recommended. Monkeypatching is usually an anti-pattern in Python.
|
|
||||||
|
|
||||||
## urllib2 Handler ##
|
|
||||||
|
|
||||||
Example use case with the `sockshandler` urllib2 handler. Note that you must import both `socks` and `sockshandler`, as the handler is its own module separate from PySocks. The module is included in the PyPI package.
|
|
||||||
|
|
||||||
import urllib2
|
|
||||||
import socks
|
|
||||||
from sockshandler import SocksiPyHandler
|
|
||||||
|
|
||||||
opener = urllib2.build_opener(SocksiPyHandler(socks.SOCKS5, "127.0.0.1", 9050))
|
|
||||||
print opener.open("http://www.somesite.com/") # All requests made by the opener will pass through the SOCKS proxy
|
|
||||||
|
|
||||||
--------------------------------------------
|
|
||||||
|
|
||||||
Original SocksiPy README attached below, amended to reflect API changes.
|
|
||||||
|
|
||||||
--------------------------------------------
|
|
||||||
|
|
||||||
SocksiPy
|
|
||||||
|
|
||||||
A Python SOCKS module.
|
|
||||||
|
|
||||||
(C) 2006 Dan-Haim. All rights reserved.
|
|
||||||
|
|
||||||
See LICENSE file for details.
|
|
||||||
|
|
||||||
|
|
||||||
*WHAT IS A SOCKS PROXY?*
|
|
||||||
|
|
||||||
A SOCKS proxy is a proxy server at the TCP level. In other words, it acts as
|
|
||||||
a tunnel, relaying all traffic going through it without modifying it.
|
|
||||||
SOCKS proxies can be used to relay traffic using any network protocol that
|
|
||||||
uses TCP.
|
|
||||||
|
|
||||||
*WHAT IS SOCKSIPY?*
|
|
||||||
|
|
||||||
This Python module allows you to create TCP connections through a SOCKS
|
|
||||||
proxy without any special effort.
|
|
||||||
It also supports relaying UDP packets with a SOCKS5 proxy.
|
|
||||||
|
|
||||||
*PROXY COMPATIBILITY*
|
|
||||||
|
|
||||||
SocksiPy is compatible with three different types of proxies:
|
|
||||||
|
|
||||||
1. SOCKS Version 4 (SOCKS4), including the SOCKS4a extension.
|
|
||||||
2. SOCKS Version 5 (SOCKS5).
|
|
||||||
3. HTTP Proxies which support tunneling using the CONNECT method.
|
|
||||||
|
|
||||||
*SYSTEM REQUIREMENTS*
|
|
||||||
|
|
||||||
Being written in Python, SocksiPy can run on any platform that has a Python
|
|
||||||
interpreter and TCP/IP support.
|
|
||||||
This module has been tested with Python 2.3 and should work with greater versions
|
|
||||||
just as well.
|
|
||||||
|
|
||||||
|
|
||||||
INSTALLATION
|
|
||||||
-------------
|
|
||||||
|
|
||||||
Simply copy the file "socks.py" to your Python's `lib/site-packages` directory,
|
|
||||||
and you're ready to go. [Editor's note: it is better to use `python setup.py install` for PySocks]
|
|
||||||
|
|
||||||
|
|
||||||
USAGE
|
|
||||||
------
|
|
||||||
|
|
||||||
First load the socks module with the command:
|
|
||||||
|
|
||||||
>>> import socks
|
|
||||||
>>>
|
|
||||||
|
|
||||||
The socks module provides a class called `socksocket`, which is the base to all of the module's functionality.
|
|
||||||
|
|
||||||
The `socksocket` object has the same initialization parameters as the normal socket
|
|
||||||
object to ensure maximal compatibility, however it should be noted that `socksocket` will only function with family being `AF_INET` and
|
|
||||||
type being either `SOCK_STREAM` or `SOCK_DGRAM`.
|
|
||||||
Generally, it is best to initialize the `socksocket` object with no parameters
|
|
||||||
|
|
||||||
>>> s = socks.socksocket()
|
|
||||||
>>>
|
|
||||||
|
|
||||||
The `socksocket` object has an interface which is very similar to socket's (in fact
|
|
||||||
the `socksocket` class is derived from socket) with a few extra methods.
|
|
||||||
To select the proxy server you would like to use, use the `set_proxy` method, whose
|
|
||||||
syntax is:
|
|
||||||
|
|
||||||
set_proxy(proxy_type, addr[, port[, rdns[, username[, password]]]])
|
|
||||||
|
|
||||||
Explanation of the parameters:
|
|
||||||
|
|
||||||
`proxy_type` - The type of the proxy server. This can be one of three possible
|
|
||||||
choices: `PROXY_TYPE_SOCKS4`, `PROXY_TYPE_SOCKS5` and `PROXY_TYPE_HTTP` for SOCKS4,
|
|
||||||
SOCKS5 and HTTP servers respectively. `SOCKS4`, `SOCKS5`, and `HTTP` are all aliases, respectively.
|
|
||||||
|
|
||||||
`addr` - The IP address or DNS name of the proxy server.
|
|
||||||
|
|
||||||
`port` - The port of the proxy server. Defaults to 1080 for socks and 8080 for http.
|
|
||||||
|
|
||||||
`rdns` - This is a boolean flag that modifies the behavior regarding DNS resolving.
|
|
||||||
If it is set to True, DNS resolving will be performed remotely, on the server.
|
|
||||||
If it is set to False, DNS resolving will be performed locally. Please note that
|
|
||||||
setting this to True with SOCKS4 servers actually uses an extension to the protocol,
|
|
||||||
called SOCKS4a, which may not be supported on all servers (SOCKS5 and http servers
|
|
||||||
always support DNS). The default is True.
|
|
||||||
|
|
||||||
`username` - For SOCKS5 servers, this allows simple username / password authentication
|
|
||||||
with the server. For SOCKS4 servers, this parameter will be sent as the userid.
|
|
||||||
This parameter is ignored if an HTTP server is being used. If it is not provided,
|
|
||||||
authentication will not be used (servers may accept unauthenticated requests).
|
|
||||||
|
|
||||||
`password` - This parameter is valid only for SOCKS5 servers and specifies the
|
|
||||||
respective password for the username provided.
|
|
||||||
|
|
||||||
Example of usage:
|
|
||||||
|
|
||||||
>>> s.set_proxy(socks.SOCKS5, "socks.example.com") # uses default port 1080
|
|
||||||
>>> s.set_proxy(socks.SOCKS4, "socks.test.com", 1081)
|
|
||||||
|
|
||||||
After the set_proxy method has been called, simply call the connect method with the
|
|
||||||
traditional parameters to establish a connection through the proxy:
|
|
||||||
|
|
||||||
>>> s.connect(("www.sourceforge.net", 80))
|
|
||||||
>>>
|
|
||||||
|
|
||||||
Connection will take a bit longer to allow negotiation with the proxy server.
|
|
||||||
Please note that calling connect without calling `set_proxy` earlier will connect
|
|
||||||
without a proxy (just like a regular socket).
|
|
||||||
|
|
||||||
Errors: Any errors in the connection process will trigger exceptions. The exception
|
|
||||||
may either be generated by the underlying socket layer or may be custom module
|
|
||||||
exceptions, whose details follow:
|
|
||||||
|
|
||||||
class `ProxyError` - This is a base exception class. It is not raised directly but
|
|
||||||
rather all other exception classes raised by this module are derived from it.
|
|
||||||
This allows an easy way to catch all proxy-related errors. It descends from `IOError`.
|
|
||||||
|
|
||||||
All `ProxyError` exceptions have an attribute `socket_err`, which will contain either a
|
|
||||||
caught `socket.error` exception, or `None` if there wasn't any.
|
|
||||||
|
|
||||||
class `GeneralProxyError` - When thrown, it indicates a problem which does not fall
|
|
||||||
into another category.
|
|
||||||
|
|
||||||
* `Sent invalid data` - This error means that unexpected data has been received from
|
|
||||||
the server. The most common reason is that the server specified as the proxy is
|
|
||||||
not really a SOCKS4/SOCKS5/HTTP proxy, or maybe the proxy type specified is wrong.
|
|
||||||
|
|
||||||
* `Connection closed unexpectedly` - The proxy server unexpectedly closed the connection.
|
|
||||||
This may indicate that the proxy server is experiencing network or software problems.
|
|
||||||
|
|
||||||
* `Bad proxy type` - This will be raised if the type of the proxy supplied to the
|
|
||||||
set_proxy function was not one of `SOCKS4`/`SOCKS5`/`HTTP`.
|
|
||||||
|
|
||||||
* `Bad input` - This will be raised if the `connect()` method is called with bad input
|
|
||||||
parameters.
|
|
||||||
|
|
||||||
class `SOCKS5AuthError` - This indicates that the connection through a SOCKS5 server
|
|
||||||
failed due to an authentication problem.
|
|
||||||
|
|
||||||
* `Authentication is required` - This will happen if you use a SOCKS5 server which
|
|
||||||
requires authentication without providing a username / password at all.
|
|
||||||
|
|
||||||
* `All offered authentication methods were rejected` - This will happen if the proxy
|
|
||||||
requires a special authentication method which is not supported by this module.
|
|
||||||
|
|
||||||
* `Unknown username or invalid password` - Self descriptive.
|
|
||||||
|
|
||||||
class `SOCKS5Error` - This will be raised for SOCKS5 errors which are not related to
|
|
||||||
authentication.
|
|
||||||
The parameter is a tuple containing a code, as given by the server,
|
|
||||||
and a description of the
|
|
||||||
error. The possible errors, according to the RFC, are:
|
|
||||||
|
|
||||||
* `0x01` - General SOCKS server failure - If for any reason the proxy server is unable to
|
|
||||||
fulfill your request (internal server error).
|
|
||||||
* `0x02` - connection not allowed by ruleset - If the address you're trying to connect to
|
|
||||||
is blacklisted on the server or requires authentication.
|
|
||||||
* `0x03` - Network unreachable - The target could not be contacted. A router on the network
|
|
||||||
had replied with a destination net unreachable error.
|
|
||||||
* `0x04` - Host unreachable - The target could not be contacted. A router on the network
|
|
||||||
had replied with a destination host unreachable error.
|
|
||||||
* `0x05` - Connection refused - The target server has actively refused the connection
|
|
||||||
(the requested port is closed).
|
|
||||||
* `0x06` - TTL expired - The TTL value of the SYN packet from the proxy to the target server
|
|
||||||
has expired. This usually means that there are network problems causing the packet
|
|
||||||
to be caught in a router-to-router "ping-pong".
|
|
||||||
* `0x07` - Command not supported - For instance if the server does not support UDP.
|
|
||||||
* `0x08` - Address type not supported - The client has provided an invalid address type.
|
|
||||||
When using this module, this error should not occur.
|
|
||||||
|
|
||||||
class `SOCKS4Error` - This will be raised for SOCKS4 errors. The parameter is a tuple
|
|
||||||
containing a code and a description of the error, as given by the server. The
|
|
||||||
possible errors, according to the specification, are:
|
|
||||||
|
|
||||||
* `0x5B` - Request rejected or failed - Will be raised in the event of a failure for any
|
|
||||||
reason other than the two mentioned next.
|
|
||||||
* `0x5C` - request rejected because SOCKS server cannot connect to identd on the client -
|
|
||||||
The Socks server had tried an ident lookup on your computer and has failed. In this
|
|
||||||
case you should run an identd server and/or configure your firewall to allow incoming
|
|
||||||
connections to local port 113 from the remote server.
|
|
||||||
* `0x5D` - request rejected because the client program and identd report different user-ids -
|
|
||||||
The Socks server had performed an ident lookup on your computer and has received a
|
|
||||||
different userid than the one you have provided. Change your userid (through the
|
|
||||||
username parameter of the set_proxy method) to match and try again.
|
|
||||||
|
|
||||||
class `HTTPError` - This will be raised for HTTP errors. The message will contain
|
|
||||||
the HTTP status code and provided error message.
|
|
||||||
|
|
||||||
After establishing the connection, the object behaves like a standard socket.
|
|
||||||
|
|
||||||
Methods like `makefile()` and `settimeout()` should behave just like regular sockets.
|
|
||||||
Call the `close()` method to close the connection.
|
|
||||||
|
|
||||||
In addition to the `socksocket` class, an additional function worth mentioning is the
|
|
||||||
`set_default_proxy` function. The parameters are the same as the `set_proxy` method.
|
|
||||||
This function will set default proxy settings for newly created `socksocket` objects,
|
|
||||||
in which the proxy settings haven't been changed via the `set_proxy` method.
|
|
||||||
This is quite useful if you wish to force 3rd party modules to use a SOCKS proxy,
|
|
||||||
by overriding the socket object.
|
|
||||||
For example:
|
|
||||||
|
|
||||||
>>> socks.set_default_proxy(socks.SOCKS5, "socks.example.com")
|
|
||||||
>>> socket.socket = socks.socksocket
|
|
||||||
>>> urllib.urlopen("http://www.sourceforge.net/")
|
|
||||||
|
|
||||||
|
|
||||||
PROBLEMS
|
|
||||||
---------
|
|
||||||
|
|
||||||
Please open a GitHub issue at https://github.com/Anorov/PySocks
|
|
||||||
|
|
||||||
|
|
||||||
@ -1,9 +0,0 @@
|
|||||||
PySocks-1.7.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
|
||||||
PySocks-1.7.1.dist-info/LICENSE,sha256=cCfiFOAU63i3rcwc7aWspxOnn8T2oMUsnaWz5wfm_-k,1401
|
|
||||||
PySocks-1.7.1.dist-info/METADATA,sha256=zbQMizjPOOP4DhEiEX24XXjNrYuIxF9UGUpN0uFDB6Y,13235
|
|
||||||
PySocks-1.7.1.dist-info/RECORD,,
|
|
||||||
PySocks-1.7.1.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
||||||
PySocks-1.7.1.dist-info/WHEEL,sha256=t_MpApv386-8PVts2R6wsTifdIn0vbUDTVv61IbqFC8,92
|
|
||||||
PySocks-1.7.1.dist-info/top_level.txt,sha256=TKSOIfCFBoK9EY8FBYbYqC3PWd3--G15ph9n8-QHPDk,19
|
|
||||||
socks.py,sha256=xOYn27t9IGrbTBzWsUUuPa0YBuplgiUykzkOB5V5iFY,31086
|
|
||||||
sockshandler.py,sha256=2SYGj-pwt1kjgLoZAmyeaEXCeZDWRmfVS_QG6kErGtY,3966
|
|
||||||
@ -1,5 +0,0 @@
|
|||||||
Wheel-Version: 1.0
|
|
||||||
Generator: bdist_wheel (0.33.3)
|
|
||||||
Root-Is-Purelib: true
|
|
||||||
Tag: py3-none-any
|
|
||||||
|
|
||||||
Binary file not shown.
Binary file not shown.
@ -1,6 +0,0 @@
|
|||||||
#!/home/runner/.cache/pypoetry/virtualenvs/bitwarden-event-logs-tER4sFWd-py3.9/bin/python3
|
|
||||||
import sys
|
|
||||||
from charset_normalizer.cli import cli_detect
|
|
||||||
if __name__ == '__main__':
|
|
||||||
sys.argv[0] = sys.argv[0].removesuffix('.exe')
|
|
||||||
sys.exit(cli_detect())
|
|
||||||
@ -1 +0,0 @@
|
|||||||
pip
|
|
||||||
@ -1,78 +0,0 @@
|
|||||||
Metadata-Version: 2.4
|
|
||||||
Name: certifi
|
|
||||||
Version: 2025.11.12
|
|
||||||
Summary: Python package for providing Mozilla's CA Bundle.
|
|
||||||
Home-page: https://github.com/certifi/python-certifi
|
|
||||||
Author: Kenneth Reitz
|
|
||||||
Author-email: me@kennethreitz.com
|
|
||||||
License: MPL-2.0
|
|
||||||
Project-URL: Source, https://github.com/certifi/python-certifi
|
|
||||||
Classifier: Development Status :: 5 - Production/Stable
|
|
||||||
Classifier: Intended Audience :: Developers
|
|
||||||
Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
|
|
||||||
Classifier: Natural Language :: English
|
|
||||||
Classifier: Programming Language :: Python
|
|
||||||
Classifier: Programming Language :: Python :: 3
|
|
||||||
Classifier: Programming Language :: Python :: 3 :: Only
|
|
||||||
Classifier: Programming Language :: Python :: 3.7
|
|
||||||
Classifier: Programming Language :: Python :: 3.8
|
|
||||||
Classifier: Programming Language :: Python :: 3.9
|
|
||||||
Classifier: Programming Language :: Python :: 3.10
|
|
||||||
Classifier: Programming Language :: Python :: 3.11
|
|
||||||
Classifier: Programming Language :: Python :: 3.12
|
|
||||||
Classifier: Programming Language :: Python :: 3.13
|
|
||||||
Classifier: Programming Language :: Python :: 3.14
|
|
||||||
Requires-Python: >=3.7
|
|
||||||
License-File: LICENSE
|
|
||||||
Dynamic: author
|
|
||||||
Dynamic: author-email
|
|
||||||
Dynamic: classifier
|
|
||||||
Dynamic: description
|
|
||||||
Dynamic: home-page
|
|
||||||
Dynamic: license
|
|
||||||
Dynamic: license-file
|
|
||||||
Dynamic: project-url
|
|
||||||
Dynamic: requires-python
|
|
||||||
Dynamic: summary
|
|
||||||
|
|
||||||
Certifi: Python SSL Certificates
|
|
||||||
================================
|
|
||||||
|
|
||||||
Certifi provides Mozilla's carefully curated collection of Root Certificates for
|
|
||||||
validating the trustworthiness of SSL certificates while verifying the identity
|
|
||||||
of TLS hosts. It has been extracted from the `Requests`_ project.
|
|
||||||
|
|
||||||
Installation
|
|
||||||
------------
|
|
||||||
|
|
||||||
``certifi`` is available on PyPI. Simply install it with ``pip``::
|
|
||||||
|
|
||||||
$ pip install certifi
|
|
||||||
|
|
||||||
Usage
|
|
||||||
-----
|
|
||||||
|
|
||||||
To reference the installed certificate authority (CA) bundle, you can use the
|
|
||||||
built-in function::
|
|
||||||
|
|
||||||
>>> import certifi
|
|
||||||
|
|
||||||
>>> certifi.where()
|
|
||||||
'/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
|
|
||||||
|
|
||||||
Or from the command line::
|
|
||||||
|
|
||||||
$ python -m certifi
|
|
||||||
/usr/local/lib/python3.7/site-packages/certifi/cacert.pem
|
|
||||||
|
|
||||||
Enjoy!
|
|
||||||
|
|
||||||
.. _`Requests`: https://requests.readthedocs.io/en/master/
|
|
||||||
|
|
||||||
Addition/Removal of Certificates
|
|
||||||
--------------------------------
|
|
||||||
|
|
||||||
Certifi does not support any addition/removal or other modification of the
|
|
||||||
CA trust store content. This project is intended to provide a reliable and
|
|
||||||
highly portable root of trust to python deployments. Look to upstream projects
|
|
||||||
for methods to use alternate trust.
|
|
||||||
@ -1,12 +0,0 @@
|
|||||||
certifi-2025.11.12.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
|
||||||
certifi-2025.11.12.dist-info/METADATA,sha256=_JprGu_1lWSdHlruRBKcorXnrfvBDhvX_6KRr8HQbLc,2475
|
|
||||||
certifi-2025.11.12.dist-info/RECORD,,
|
|
||||||
certifi-2025.11.12.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
||||||
certifi-2025.11.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
||||||
certifi-2025.11.12.dist-info/licenses/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
|
|
||||||
certifi-2025.11.12.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
|
|
||||||
certifi/__init__.py,sha256=1BRSxNMnZW7CZ2oJtYWLoJgfHfcB9i273exwiPwfjJM,94
|
|
||||||
certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
|
|
||||||
certifi/cacert.pem,sha256=oa1dZD4hxDtb7XTH4IkdzbWPavUcis4eTwINZUqlKhY,283932
|
|
||||||
certifi/core.py,sha256=XFXycndG5pf37ayeF8N32HUuDafsyhkVMbO4BAPWHa0,3394
|
|
||||||
certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
||||||
@ -1,5 +0,0 @@
|
|||||||
Wheel-Version: 1.0
|
|
||||||
Generator: setuptools (80.9.0)
|
|
||||||
Root-Is-Purelib: true
|
|
||||||
Tag: py3-none-any
|
|
||||||
|
|
||||||
@ -1,20 +0,0 @@
|
|||||||
This package contains a modified version of ca-bundle.crt:
|
|
||||||
|
|
||||||
ca-bundle.crt -- Bundle of CA Root Certificates
|
|
||||||
|
|
||||||
This is a bundle of X.509 certificates of public Certificate Authorities
|
|
||||||
(CA). These were automatically extracted from Mozilla's root certificates
|
|
||||||
file (certdata.txt). This file can be found in the mozilla source tree:
|
|
||||||
https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
|
|
||||||
It contains the certificates in PEM format and therefore
|
|
||||||
can be directly used with curl / libcurl / php_curl, or with
|
|
||||||
an Apache+mod_ssl webserver for SSL client authentication.
|
|
||||||
Just configure this file as the SSLCACertificateFile.#
|
|
||||||
|
|
||||||
***** BEGIN LICENSE BLOCK *****
|
|
||||||
This Source Code Form is subject to the terms of the Mozilla Public License,
|
|
||||||
v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
|
|
||||||
one at http://mozilla.org/MPL/2.0/.
|
|
||||||
|
|
||||||
***** END LICENSE BLOCK *****
|
|
||||||
@(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $
|
|
||||||
@ -1,4 +0,0 @@
|
|||||||
from .core import contents, where
|
|
||||||
|
|
||||||
__all__ = ["contents", "where"]
|
|
||||||
__version__ = "2025.11.12"
|
|
||||||
@ -1,12 +0,0 @@
|
|||||||
import argparse
|
|
||||||
|
|
||||||
from certifi import contents, where
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument("-c", "--contents", action="store_true")
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
if args.contents:
|
|
||||||
print(contents())
|
|
||||||
else:
|
|
||||||
print(where())
|
|
||||||
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@ -1,83 +0,0 @@
|
|||||||
"""
|
|
||||||
certifi.py
|
|
||||||
~~~~~~~~~~
|
|
||||||
|
|
||||||
This module returns the installation location of cacert.pem or its contents.
|
|
||||||
"""
|
|
||||||
import sys
|
|
||||||
import atexit
|
|
||||||
|
|
||||||
def exit_cacert_ctx() -> None:
|
|
||||||
_CACERT_CTX.__exit__(None, None, None) # type: ignore[union-attr]
|
|
||||||
|
|
||||||
|
|
||||||
if sys.version_info >= (3, 11):
|
|
||||||
|
|
||||||
from importlib.resources import as_file, files
|
|
||||||
|
|
||||||
_CACERT_CTX = None
|
|
||||||
_CACERT_PATH = None
|
|
||||||
|
|
||||||
def where() -> str:
|
|
||||||
# This is slightly terrible, but we want to delay extracting the file
|
|
||||||
# in cases where we're inside of a zipimport situation until someone
|
|
||||||
# actually calls where(), but we don't want to re-extract the file
|
|
||||||
# on every call of where(), so we'll do it once then store it in a
|
|
||||||
# global variable.
|
|
||||||
global _CACERT_CTX
|
|
||||||
global _CACERT_PATH
|
|
||||||
if _CACERT_PATH is None:
|
|
||||||
# This is slightly janky, the importlib.resources API wants you to
|
|
||||||
# manage the cleanup of this file, so it doesn't actually return a
|
|
||||||
# path, it returns a context manager that will give you the path
|
|
||||||
# when you enter it and will do any cleanup when you leave it. In
|
|
||||||
# the common case of not needing a temporary file, it will just
|
|
||||||
# return the file system location and the __exit__() is a no-op.
|
|
||||||
#
|
|
||||||
# We also have to hold onto the actual context manager, because
|
|
||||||
# it will do the cleanup whenever it gets garbage collected, so
|
|
||||||
# we will also store that at the global level as well.
|
|
||||||
_CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
|
|
||||||
_CACERT_PATH = str(_CACERT_CTX.__enter__())
|
|
||||||
atexit.register(exit_cacert_ctx)
|
|
||||||
|
|
||||||
return _CACERT_PATH
|
|
||||||
|
|
||||||
def contents() -> str:
|
|
||||||
return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
|
|
||||||
|
|
||||||
else:
|
|
||||||
|
|
||||||
from importlib.resources import path as get_path, read_text
|
|
||||||
|
|
||||||
_CACERT_CTX = None
|
|
||||||
_CACERT_PATH = None
|
|
||||||
|
|
||||||
def where() -> str:
|
|
||||||
# This is slightly terrible, but we want to delay extracting the
|
|
||||||
# file in cases where we're inside of a zipimport situation until
|
|
||||||
# someone actually calls where(), but we don't want to re-extract
|
|
||||||
# the file on every call of where(), so we'll do it once then store
|
|
||||||
# it in a global variable.
|
|
||||||
global _CACERT_CTX
|
|
||||||
global _CACERT_PATH
|
|
||||||
if _CACERT_PATH is None:
|
|
||||||
# This is slightly janky, the importlib.resources API wants you
|
|
||||||
# to manage the cleanup of this file, so it doesn't actually
|
|
||||||
# return a path, it returns a context manager that will give
|
|
||||||
# you the path when you enter it and will do any cleanup when
|
|
||||||
# you leave it. In the common case of not needing a temporary
|
|
||||||
# file, it will just return the file system location and the
|
|
||||||
# __exit__() is a no-op.
|
|
||||||
#
|
|
||||||
# We also have to hold onto the actual context manager, because
|
|
||||||
# it will do the cleanup whenever it gets garbage collected, so
|
|
||||||
# we will also store that at the global level as well.
|
|
||||||
_CACERT_CTX = get_path("certifi", "cacert.pem")
|
|
||||||
_CACERT_PATH = str(_CACERT_CTX.__enter__())
|
|
||||||
atexit.register(exit_cacert_ctx)
|
|
||||||
|
|
||||||
return _CACERT_PATH
|
|
||||||
|
|
||||||
def contents() -> str:
|
|
||||||
return read_text("certifi", "cacert.pem", encoding="ascii")
|
|
||||||
@ -1 +0,0 @@
|
|||||||
pip
|
|
||||||
@ -1,764 +0,0 @@
|
|||||||
Metadata-Version: 2.4
|
|
||||||
Name: charset-normalizer
|
|
||||||
Version: 3.4.4
|
|
||||||
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
|
|
||||||
Author-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
|
|
||||||
Maintainer-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
|
|
||||||
License: MIT
|
|
||||||
Project-URL: Changelog, https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md
|
|
||||||
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/
|
|
||||||
Project-URL: Code, https://github.com/jawah/charset_normalizer
|
|
||||||
Project-URL: Issue tracker, https://github.com/jawah/charset_normalizer/issues
|
|
||||||
Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
|
|
||||||
Classifier: Development Status :: 5 - Production/Stable
|
|
||||||
Classifier: Intended Audience :: Developers
|
|
||||||
Classifier: Operating System :: OS Independent
|
|
||||||
Classifier: Programming Language :: Python
|
|
||||||
Classifier: Programming Language :: Python :: 3
|
|
||||||
Classifier: Programming Language :: Python :: 3.7
|
|
||||||
Classifier: Programming Language :: Python :: 3.8
|
|
||||||
Classifier: Programming Language :: Python :: 3.9
|
|
||||||
Classifier: Programming Language :: Python :: 3.10
|
|
||||||
Classifier: Programming Language :: Python :: 3.11
|
|
||||||
Classifier: Programming Language :: Python :: 3.12
|
|
||||||
Classifier: Programming Language :: Python :: 3.13
|
|
||||||
Classifier: Programming Language :: Python :: 3.14
|
|
||||||
Classifier: Programming Language :: Python :: 3 :: Only
|
|
||||||
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
||||||
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
||||||
Classifier: Topic :: Text Processing :: Linguistic
|
|
||||||
Classifier: Topic :: Utilities
|
|
||||||
Classifier: Typing :: Typed
|
|
||||||
Requires-Python: >=3.7
|
|
||||||
Description-Content-Type: text/markdown
|
|
||||||
License-File: LICENSE
|
|
||||||
Provides-Extra: unicode-backport
|
|
||||||
Dynamic: license-file
|
|
||||||
|
|
||||||
<h1 align="center">Charset Detection, for Everyone 👋</h1>
|
|
||||||
|
|
||||||
<p align="center">
|
|
||||||
<sup>The Real First Universal Charset Detector</sup><br>
|
|
||||||
<a href="https://pypi.org/project/charset-normalizer">
|
|
||||||
<img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
|
|
||||||
</a>
|
|
||||||
<a href="https://pepy.tech/project/charset-normalizer/">
|
|
||||||
<img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
|
|
||||||
</a>
|
|
||||||
<a href="https://bestpractices.coreinfrastructure.org/projects/7297">
|
|
||||||
<img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
|
|
||||||
</a>
|
|
||||||
</p>
|
|
||||||
<p align="center">
|
|
||||||
<sup><i>Featured Packages</i></sup><br>
|
|
||||||
<a href="https://github.com/jawah/niquests">
|
|
||||||
<img alt="Static Badge" src="https://img.shields.io/badge/Niquests-Most_Advanced_HTTP_Client-cyan">
|
|
||||||
</a>
|
|
||||||
<a href="https://github.com/jawah/wassima">
|
|
||||||
<img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Replacement-cyan">
|
|
||||||
</a>
|
|
||||||
</p>
|
|
||||||
<p align="center">
|
|
||||||
<sup><i>In other languages (unofficial port - by the community)</i></sup><br>
|
|
||||||
<a href="https://github.com/nickspring/charset-normalizer-rs">
|
|
||||||
<img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
|
|
||||||
</a>
|
|
||||||
</p>
|
|
||||||
|
|
||||||
> A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
|
|
||||||
> I'm trying to resolve the issue by taking a new approach.
|
|
||||||
> All IANA character set names for which the Python core library provides codecs are supported.
|
|
||||||
|
|
||||||
<p align="center">
|
|
||||||
>>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
|
|
||||||
</p>
|
|
||||||
|
|
||||||
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
|
|
||||||
|
|
||||||
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
|
|
||||||
|--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
|
|
||||||
| `Fast` | ❌ | ✅ | ✅ |
|
|
||||||
| `Universal**` | ❌ | ✅ | ❌ |
|
|
||||||
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
|
|
||||||
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
|
|
||||||
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
|
|
||||||
| `Native Python` | ✅ | ✅ | ❌ |
|
|
||||||
| `Detect spoken language` | ❌ | ✅ | N/A |
|
|
||||||
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
|
|
||||||
| `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
|
|
||||||
| `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
|
|
||||||
|
|
||||||
<p align="center">
|
|
||||||
<img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
|
|
||||||
</p>
|
|
||||||
|
|
||||||
*\*\* : They are clearly using specific code for a specific encoding, even if covering most of the used ones*<br>
|
|
||||||
|
|
||||||
## ⚡ Performance
|
|
||||||
|
|
||||||
This package offers better performance than its counterpart Chardet. Here are some numbers.
|
|
||||||
|
|
||||||
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|
|
||||||
|-----------------------------------------------|:--------:|:------------------:|:------------------:|
|
|
||||||
| [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec |
|
|
||||||
| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
|
|
||||||
|
|
||||||
| Package | 99th percentile | 95th percentile | 50th percentile |
|
|
||||||
|-----------------------------------------------|:---------------:|:---------------:|:---------------:|
|
|
||||||
| [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms |
|
|
||||||
| charset-normalizer | 100 ms | 50 ms | 5 ms |
|
|
||||||
|
|
||||||
_updated as of december 2024 using CPython 3.12_
|
|
||||||
|
|
||||||
Chardet's performance on larger files (1MB+) is very poor. Expect a huge difference on large payloads.
|
|
||||||
|
|
||||||
> Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
|
|
||||||
> And yes, these results might change at any time. The dataset can be updated to include more files.
|
|
||||||
> The actual delays heavily depend on your CPU capabilities. The factors should remain the same.
|
|
||||||
> Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
|
|
||||||
> (e.g. Supported Encoding) Challenge-them if you want.
|
|
||||||
|
|
||||||
## ✨ Installation
|
|
||||||
|
|
||||||
Using pip:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
pip install charset-normalizer -U
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🚀 Basic Usage
|
|
||||||
|
|
||||||
### CLI
|
|
||||||
This package comes with a CLI.
|
|
||||||
|
|
||||||
```
|
|
||||||
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
|
|
||||||
file [file ...]
|
|
||||||
|
|
||||||
The Real First Universal Charset Detector. Discover originating encoding used
|
|
||||||
on text file. Normalize text to unicode.
|
|
||||||
|
|
||||||
positional arguments:
|
|
||||||
files File(s) to be analysed
|
|
||||||
|
|
||||||
optional arguments:
|
|
||||||
-h, --help show this help message and exit
|
|
||||||
-v, --verbose Display complementary information about file if any.
|
|
||||||
Stdout will contain logs about the detection process.
|
|
||||||
-a, --with-alternative
|
|
||||||
Output complementary possibilities if any. Top-level
|
|
||||||
JSON WILL be a list.
|
|
||||||
-n, --normalize Permit to normalize input file. If not set, program
|
|
||||||
does not write anything.
|
|
||||||
-m, --minimal Only output the charset detected to STDOUT. Disabling
|
|
||||||
JSON output.
|
|
||||||
-r, --replace Replace file when trying to normalize it instead of
|
|
||||||
creating a new one.
|
|
||||||
-f, --force Replace file without asking if you are sure, use this
|
|
||||||
flag with caution.
|
|
||||||
-t THRESHOLD, --threshold THRESHOLD
|
|
||||||
Define a custom maximum amount of chaos allowed in
|
|
||||||
decoded content. 0. <= chaos <= 1.
|
|
||||||
--version Show version information and exit.
|
|
||||||
```
|
|
||||||
|
|
||||||
```bash
|
|
||||||
normalizer ./data/sample.1.fr.srt
|
|
||||||
```
|
|
||||||
|
|
||||||
or
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python -m charset_normalizer ./data/sample.1.fr.srt
|
|
||||||
```
|
|
||||||
|
|
||||||
🎉 Since version 1.4.0 the CLI produces easily usable stdout results in JSON format.
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
|
|
||||||
"encoding": "cp1252",
|
|
||||||
"encoding_aliases": [
|
|
||||||
"1252",
|
|
||||||
"windows_1252"
|
|
||||||
],
|
|
||||||
"alternative_encodings": [
|
|
||||||
"cp1254",
|
|
||||||
"cp1256",
|
|
||||||
"cp1258",
|
|
||||||
"iso8859_14",
|
|
||||||
"iso8859_15",
|
|
||||||
"iso8859_16",
|
|
||||||
"iso8859_3",
|
|
||||||
"iso8859_9",
|
|
||||||
"latin_1",
|
|
||||||
"mbcs"
|
|
||||||
],
|
|
||||||
"language": "French",
|
|
||||||
"alphabets": [
|
|
||||||
"Basic Latin",
|
|
||||||
"Latin-1 Supplement"
|
|
||||||
],
|
|
||||||
"has_sig_or_bom": false,
|
|
||||||
"chaos": 0.149,
|
|
||||||
"coherence": 97.152,
|
|
||||||
"unicode_path": null,
|
|
||||||
"is_preferred": true
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Python
|
|
||||||
*Just print out normalized text*
|
|
||||||
```python
|
|
||||||
from charset_normalizer import from_path
|
|
||||||
|
|
||||||
results = from_path('./my_subtitle.srt')
|
|
||||||
|
|
||||||
print(str(results.best()))
|
|
||||||
```
|
|
||||||
|
|
||||||
*Upgrade your code without effort*
|
|
||||||
```python
|
|
||||||
from charset_normalizer import detect
|
|
||||||
```
|
|
||||||
|
|
||||||
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
|
|
||||||
|
|
||||||
See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
|
|
||||||
|
|
||||||
## 😇 Why
|
|
||||||
|
|
||||||
When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
|
|
||||||
reliable alternative using a completely different method. Also! I never back down on a good challenge!
|
|
||||||
|
|
||||||
I **don't care** about the **originating charset** encoding, because **two different tables** can
|
|
||||||
produce **two identical rendered strings.**
|
|
||||||
What I want is to get readable text, the best I can.
|
|
||||||
|
|
||||||
In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
|
|
||||||
|
|
||||||
Don't confuse the package **ftfy** with charset-normalizer or chardet. ftfy's goal is to repair Unicode strings, whereas charset-normalizer's goal is to convert raw files in an unknown encoding to Unicode.
|
|
||||||
|
|
||||||
## 🍰 How
|
|
||||||
|
|
||||||
- Discard all charset encoding tables that could not fit the binary content.
|
|
||||||
- Measure noise, or the mess once opened (by chunks) with a corresponding charset encoding.
|
|
||||||
- Extract matches with the lowest mess detected.
|
|
||||||
- Additionally, we measure coherence / probe for a language.
|
|
||||||
|
|
||||||
**Wait a minute**, what is noise/mess and coherence according to **YOU ?**
|
|
||||||
|
|
||||||
*Noise :* I opened hundreds of text files, **written by humans**, with the wrong encoding table. **I observed**, then
|
|
||||||
**I established** some ground rules about **what is obvious** when **it seems like** a mess (aka. defining noise in rendered text).
|
|
||||||
I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
|
|
||||||
improve or rewrite it.
|
|
||||||
|
|
||||||
*Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
|
|
||||||
that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
|
|
||||||
|
|
||||||
## ⚡ Known limitations
|
|
||||||
|
|
||||||
- Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
|
|
||||||
- Every charset detector heavily depends on sufficient content. In common cases, do not bother running detection on very tiny content.
|
|
||||||
|
|
||||||
## ⚠️ About Python EOLs
|
|
||||||
|
|
||||||
**If you are running:**
|
|
||||||
|
|
||||||
- Python >=2.7,<3.5: Unsupported
|
|
||||||
- Python 3.5: charset-normalizer < 2.1
|
|
||||||
- Python 3.6: charset-normalizer < 3.1
|
|
||||||
- Python 3.7: charset-normalizer < 4.0
|
|
||||||
|
|
||||||
Upgrade your Python interpreter as soon as possible.
|
|
||||||
|
|
||||||
## 👤 Contributing
|
|
||||||
|
|
||||||
Contributions, issues and feature requests are very much welcome.<br />
|
|
||||||
Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
|
|
||||||
|
|
||||||
## 📝 License
|
|
||||||
|
|
||||||
Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
|
|
||||||
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
|
|
||||||
|
|
||||||
Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
|
|
||||||
|
|
||||||
## 💼 For Enterprise
|
|
||||||
|
|
||||||
Professional support for charset-normalizer is available as part of the [Tidelift
|
|
||||||
Subscription][1]. Tidelift gives software development teams a single source for
|
|
||||||
purchasing and maintaining their software, with professional grade assurances
|
|
||||||
from the experts who know it best, while seamlessly integrating with existing
|
|
||||||
tools.
|
|
||||||
|
|
||||||
[1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
|
|
||||||
|
|
||||||
[OpenSSF Best Practices](https://www.bestpractices.dev/projects/7297)
|
|
||||||
|
|
||||||
# Changelog
|
|
||||||
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
||||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|
||||||
|
|
||||||
## [3.4.4](https://github.com/Ousret/charset_normalizer/compare/3.4.2...3.4.4) (2025-10-13)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Bound `setuptools` to a specific constraint `setuptools>=68,<=81`.
|
|
||||||
- Raised upper bound of mypyc for the optional pre-built extension to v1.18.2
|
|
||||||
|
|
||||||
### Removed
|
|
||||||
- `setuptools-scm` as a build dependency.
|
|
||||||
|
|
||||||
### Misc
|
|
||||||
- Enforced hashes in `dev-requirements.txt` and created `ci-requirements.txt` for security purposes.
|
|
||||||
- Additional pre-built wheels for riscv64, s390x, and armv7l architectures.
|
|
||||||
- Restore `multiple.intoto.jsonl` in GitHub releases in addition to individual attestation file per wheel.
|
|
||||||
|
|
||||||
## [3.4.3](https://github.com/Ousret/charset_normalizer/compare/3.4.2...3.4.3) (2025-08-09)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- mypy(c) is no longer a required dependency at build time if `CHARSET_NORMALIZER_USE_MYPYC` isn't set to `1`. (#595) (#583)
|
|
||||||
- automatically lower confidence on small bytes samples that are not Unicode in `detect` output legacy function. (#391)
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- Custom build backend to overcome inability to mark mypy as an optional dependency in the build phase.
|
|
||||||
- Support for Python 3.14
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- sdist archive contained useless directories.
|
|
||||||
- automatically fallback on valid UTF-16 or UTF-32 even if the md says it's noisy. (#633)
|
|
||||||
|
|
||||||
### Misc
|
|
||||||
- SBOM are automatically published to the relevant GitHub release to comply with regulatory changes.
|
|
||||||
Each published wheel comes with its SBOM. We choose CycloneDX as the format.
|
|
||||||
- Prebuilt optimized wheels are no longer distributed by default for CPython 3.7 due to a change in cibuildwheel.
|
|
||||||
|
|
||||||
## [3.4.2](https://github.com/Ousret/charset_normalizer/compare/3.4.1...3.4.2) (2025-05-02)
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- Addressed the DeprecationWarning in our CLI regarding `argparse.FileType` by backporting the target class into the package. (#591)
|
|
||||||
- Improved the overall reliability of the detector with CJK Ideographs. (#605) (#587)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Optional mypyc compilation upgraded to version 1.15 for Python >= 3.8
|
|
||||||
|
|
||||||
## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Project metadata are now stored using `pyproject.toml` instead of `setup.cfg` using setuptools as the build backend.
|
|
||||||
- Enforce annotation delayed loading for a simpler and consistent types in the project.
|
|
||||||
- Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- pre-commit configuration.
|
|
||||||
- noxfile.
|
|
||||||
|
|
||||||
### Removed
|
|
||||||
- `build-requirements.txt` as per using `pyproject.toml` native build configuration.
|
|
||||||
- `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile).
|
|
||||||
- `setup.cfg` in favor of `pyproject.toml` metadata configuration.
|
|
||||||
- Unused `utils.range_scan` function.
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572)
|
|
||||||
- Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+
|
|
||||||
|
|
||||||
## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- Argument `--no-preemptive` in the CLI to prevent the detector to search for hints.
|
|
||||||
- Support for Python 3.13 (#512)
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch.
|
|
||||||
- Improved the general reliability of the detector based on user feedback. (#520) (#509) (#498) (#407) (#537)
|
|
||||||
- Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
|
|
||||||
|
|
||||||
## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- Unintentional memory usage regression when using large payload that match several encoding (#376)
|
|
||||||
- Regression on some detection case showcased in the documentation (#371)
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- Noise (md) probe that identifies malformed Arabic representation due to the presence of letters in isolated form (credit to my wife)
|
|
||||||
|
|
||||||
## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
|
|
||||||
- Improved the general detection reliability based on reports from the community
|
|
||||||
|
|
||||||
## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
|
|
||||||
- Support for 9 forgotten encodings that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
|
|
||||||
|
|
||||||
### Removed
|
|
||||||
- (internal) Redundant utils.is_ascii function and unused function is_private_use_only
|
|
||||||
- (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
|
|
||||||
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
|
|
||||||
|
|
||||||
## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Typehint for function `from_path` no longer enforce `PathLike` as its first argument
|
|
||||||
- Minor improvement over the global detection reliability
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
|
|
||||||
- Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` that allow a deeper control over the detection (default True)
|
|
||||||
- Explicit support for Python 3.12
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
|
|
||||||
|
|
||||||
## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
|
|
||||||
|
|
||||||
### Removed
|
|
||||||
- Support for Python 3.6 (PR #260)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Optional speedup provided by mypy/c 1.0.1
|
|
||||||
|
|
||||||
## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Speedup provided by mypy/c 0.990 on Python >= 3.7
|
|
||||||
|
|
||||||
## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
|
|
||||||
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
|
||||||
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
|
||||||
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Build with static metadata using 'build' frontend
|
|
||||||
- Make the language detection stricter
|
|
||||||
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- CLI with opt --normalize fail when using full path for files
|
|
||||||
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
|
|
||||||
- Sphinx warnings when generating the documentation
|
|
||||||
|
|
||||||
### Removed
|
|
||||||
- Coherence detector no longer return 'Simple English' instead return 'English'
|
|
||||||
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
|
||||||
- Breaking: Method `first()` and `best()` from CharsetMatch
|
|
||||||
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
|
||||||
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
|
||||||
- Breaking: Top-level function `normalize`
|
|
||||||
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
|
||||||
- Support for the backport `unicodedata2`
|
|
||||||
|
|
||||||
## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
|
|
||||||
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
|
||||||
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Build with static metadata using 'build' frontend
|
|
||||||
- Make the language detection stricter
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- CLI with opt --normalize fail when using full path for files
|
|
||||||
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
|
|
||||||
|
|
||||||
### Removed
|
|
||||||
- Coherence detector no longer return 'Simple English' instead return 'English'
|
|
||||||
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
|
||||||
|
|
||||||
## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
|
|
||||||
|
|
||||||
### Removed
|
|
||||||
- Breaking: Method `first()` and `best()` from CharsetMatch
|
|
||||||
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- Sphinx warnings when generating the documentation
|
|
||||||
|
|
||||||
## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
|
||||||
|
|
||||||
### Removed
|
|
||||||
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
|
||||||
- Breaking: Top-level function `normalize`
|
|
||||||
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
|
||||||
- Support for the backport `unicodedata2`
|
|
||||||
|
|
||||||
## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
|
|
||||||
|
|
||||||
### Deprecated
|
|
||||||
- Function `normalize` scheduled for removal in 3.0
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Removed useless call to decode in fn is_unprintable (#206)
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
|
|
||||||
|
|
||||||
## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- Output the Unicode table version when running the CLI with `--version` (PR #194)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
|
|
||||||
- Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
|
|
||||||
- CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
|
|
||||||
|
|
||||||
### Removed
|
|
||||||
- Support for Python 3.5 (PR #192)
|
|
||||||
|
|
||||||
### Deprecated
|
|
||||||
- Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
|
|
||||||
|
|
||||||
## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- ASCII miss-detection on rare cases (PR #170)
|
|
||||||
|
|
||||||
## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- Explicit support for Python 3.11 (PR #164)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- The logging behavior have been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
|
|
||||||
|
|
||||||
## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Skipping the language-detection (CD) on ASCII (PR #155)
|
|
||||||
|
|
||||||
## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- Wrong logging level applied when setting kwarg `explain` to True (PR #146)
|
|
||||||
|
|
||||||
## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
|
|
||||||
### Changed
|
|
||||||
- Improvement over Vietnamese detection (PR #126)
|
|
||||||
- MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
|
|
||||||
- Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
|
|
||||||
- call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
|
|
||||||
- Code style as refactored by Sourcery-AI (PR #131)
|
|
||||||
- Minor adjustment on the MD around european words (PR #133)
|
|
||||||
- Remove and replace SRTs from assets / tests (PR #139)
|
|
||||||
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
|
||||||
- Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
|
|
||||||
- Avoid using too insignificant chunk (PR #137)
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
|
||||||
- Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
|
|
||||||
|
|
||||||
## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
|
|
||||||
### Added
|
|
||||||
- Add support for Kazakh (Cyrillic) language detection (PR #109)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Further, improve inferring the language from a given single-byte code page (PR #112)
|
|
||||||
- Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
|
|
||||||
- Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
|
|
||||||
- Various detection improvement (MD+CD) (PR #117)
|
|
||||||
|
|
||||||
### Removed
|
|
||||||
- Remove redundant logging entry about detected language(s) (PR #115)
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
|
|
||||||
|
|
||||||
## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
|
|
||||||
### Fixed
|
|
||||||
- Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
|
|
||||||
- Fix CLI crash when using --minimal output in certain cases (PR #103)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
|
|
||||||
|
|
||||||
## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
|
|
||||||
### Changed
|
|
||||||
- The project now complies with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
|
|
||||||
- The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
|
|
||||||
- The Unicode detection is slightly improved (PR #93)
|
|
||||||
- Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
|
|
||||||
|
|
||||||
### Removed
|
|
||||||
- The project no longer raises a warning on tiny content given for detection; it will simply be logged as a warning instead (PR #92)
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
|
|
||||||
- Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
|
|
||||||
- The MANIFEST.in was not exhaustive (PR #78)
|
|
||||||
|
|
||||||
## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
|
|
||||||
### Fixed
|
|
||||||
- The CLI no longer raise an unexpected exception when no encoding has been found (PR #70)
|
|
||||||
- Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
|
|
||||||
- The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
|
|
||||||
- Submatch factoring could be wrong in rare edge cases (PR #72)
|
|
||||||
- Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
|
|
||||||
- Fix line endings from CRLF to LF for certain project files (PR #67)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
|
|
||||||
- Allow fallback on specified encoding if any (PR #71)
|
|
||||||
|
|
||||||
## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
|
|
||||||
### Changed
|
|
||||||
- Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
|
|
||||||
- According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
|
|
||||||
|
|
||||||
## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
|
|
||||||
### Fixed
|
|
||||||
- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
|
|
||||||
|
|
||||||
## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
|
|
||||||
### Fixed
|
|
||||||
- Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
|
|
||||||
- Using explain=False permanently disable the verbose output in the current runtime (PR #47)
|
|
||||||
- One log entry (language target preemptive) was not shown in logs when using explain=True (PR #47)
|
|
||||||
- Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Public function normalize default args values were not aligned with from_bytes (PR #53)
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
|
|
||||||
|
|
||||||
## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
|
|
||||||
### Changed
|
|
||||||
- 4 to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
|
|
||||||
- Emphasis has been put on UTF-8 detection, which should now be nearly instantaneous.
|
|
||||||
- The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
|
|
||||||
- The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
|
|
||||||
- The program has been rewritten to ease the readability and maintainability. (+ using static typing)
|
|
||||||
- utf_7 detection has been reinstated.
|
|
||||||
|
|
||||||
### Removed
|
|
||||||
- This package no longer require anything when used with Python 3.5 (Dropped cached_property)
|
|
||||||
- Removed support for these languages: Catalan, Esperanto, Kazakh, Basque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
|
|
||||||
- The exception hook on UnicodeDecodeError has been removed.
|
|
||||||
|
|
||||||
### Deprecated
|
|
||||||
- Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- The CLI output used the relative path of the file(s). Should be absolute.
|
|
||||||
|
|
||||||
## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
|
|
||||||
### Fixed
|
|
||||||
- Logger configuration/usage no longer conflict with others (PR #44)
|
|
||||||
|
|
||||||
## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
|
|
||||||
### Removed
|
|
||||||
- Using standard logging instead of using the package loguru.
|
|
||||||
- Dropping nose test framework in favor of the maintained pytest.
|
|
||||||
- Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
|
|
||||||
- Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
|
|
||||||
- Stop support for UTF-7 that does not contain a SIG.
|
|
||||||
- Dropping PrettyTable, replaced with pure JSON output in CLI.
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
|
|
||||||
- Not searching properly for the BOM when trying utf32/16 parent codec.
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Improving the package final size by compressing frequencies.json.
|
|
||||||
- Huge improvement for large payloads.
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- CLI now produces JSON consumable output.
|
|
||||||
- Return ASCII if given sequences fit. Given reasonable confidence.
|
|
||||||
|
|
||||||
## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
|
|
||||||
|
|
||||||
## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
|
|
||||||
|
|
||||||
## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
|
|
||||||
|
|
||||||
## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Amend the previous release to allow prettytable 2.0 (PR #35)
|
|
||||||
|
|
||||||
## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- Fix error while using the package with a python pre-release interpreter (PR #33)
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Dependencies refactoring, constraints revised.
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- Add python 3.9 and 3.10 to the supported interpreters
|
|
||||||
|
|
||||||
MIT License
|
|
||||||
|
|
||||||
Copyright (c) 2025 TAHRI Ahmed R.
|
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
|
||||||
in the Software without restriction, including without limitation the rights
|
|
||||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
copies of the Software, and to permit persons to whom the Software is
|
|
||||||
furnished to do so, subject to the following conditions:
|
|
||||||
|
|
||||||
The above copyright notice and this permission notice shall be included in all
|
|
||||||
copies or substantial portions of the Software.
|
|
||||||
|
|
||||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
SOFTWARE.
|
|
||||||
@ -1,24 +0,0 @@
|
|||||||
../../bin/normalizer,sha256=9v9CiM1SMj9hAM5YRMeRilB5D2s9HWSQ_qInrU2KpQU,253
|
|
||||||
charset_normalizer-3.4.4.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
|
||||||
charset_normalizer-3.4.4.dist-info/METADATA,sha256=jVuUFBti8dav19YLvWissTihVdF2ozUY4KKMw7jdkBQ,37303
|
|
||||||
charset_normalizer-3.4.4.dist-info/RECORD,,
|
|
||||||
charset_normalizer-3.4.4.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
||||||
charset_normalizer-3.4.4.dist-info/WHEEL,sha256=ylJURW79exB2C-59Cgd1o0-VkgYeH2GaNb8U43WRbwg,187
|
|
||||||
charset_normalizer-3.4.4.dist-info/entry_points.txt,sha256=ADSTKrkXZ3hhdOVFi6DcUEHQRS0xfxDIE_pEz4wLIXA,65
|
|
||||||
charset_normalizer-3.4.4.dist-info/licenses/LICENSE,sha256=bQ1Bv-FwrGx9wkjJpj4lTQ-0WmDVCoJX0K-SxuJJuIc,1071
|
|
||||||
charset_normalizer-3.4.4.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
|
|
||||||
charset_normalizer/__init__.py,sha256=OKRxRv2Zhnqk00tqkN0c1BtJjm165fWXLydE52IKuHc,1590
|
|
||||||
charset_normalizer/__main__.py,sha256=yzYxMR-IhKRHYwcSlavEv8oGdwxsR89mr2X09qXGdps,109
|
|
||||||
charset_normalizer/api.py,sha256=V07i8aVeCD8T2fSia3C-fn0i9t8qQguEBhsqszg32Ns,22668
|
|
||||||
charset_normalizer/cd.py,sha256=WKTo1HDb-H9HfCDc3Bfwq5jzS25Ziy9SE2a74SgTq88,12522
|
|
||||||
charset_normalizer/cli/__init__.py,sha256=D8I86lFk2-py45JvqxniTirSj_sFyE6sjaY_0-G1shc,136
|
|
||||||
charset_normalizer/cli/__main__.py,sha256=dMaXG6IJXRvqq8z2tig7Qb83-BpWTln55ooiku5_uvg,12646
|
|
||||||
charset_normalizer/constant.py,sha256=7UVY4ldYhmQMHUdgQ_sgZmzcQ0xxYxpBunqSZ-XJZ8U,42713
|
|
||||||
charset_normalizer/legacy.py,sha256=sYBzSpzsRrg_wF4LP536pG64BItw7Tqtc3SMQAHvFLM,2731
|
|
||||||
charset_normalizer/md.cpython-39-aarch64-linux-gnu.so,sha256=Fl5b5yFpdjEkjaOr3bUxLjAmBYD37YTHfMCVs6SBh6A,201304
|
|
||||||
charset_normalizer/md.py,sha256=-_oN3h3_X99nkFfqamD3yu45DC_wfk5odH0Tr_CQiXs,20145
|
|
||||||
charset_normalizer/md__mypyc.cpython-39-aarch64-linux-gnu.so,sha256=nAx9ZddHDN7KxsVUxkP8Qen7djjHdO1_0_bP_aqC34A,324120
|
|
||||||
charset_normalizer/models.py,sha256=lKXhOnIPtiakbK3i__J9wpOfzx3JDTKj7Dn3Rg0VaRI,12394
|
|
||||||
charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
||||||
charset_normalizer/utils.py,sha256=sTejPgrdlNsKNucZfJCxJ95lMTLA0ShHLLE3n5wpT9Q,12170
|
|
||||||
charset_normalizer/version.py,sha256=nKE4qBNk5WA4LIJ_yIH_aSDfvtsyizkWMg-PUG-UZVk,115
|
|
||||||
@ -1,7 +0,0 @@
|
|||||||
Wheel-Version: 1.0
|
|
||||||
Generator: setuptools (80.9.0)
|
|
||||||
Root-Is-Purelib: false
|
|
||||||
Tag: cp39-cp39-manylinux_2_17_aarch64
|
|
||||||
Tag: cp39-cp39-manylinux2014_aarch64
|
|
||||||
Tag: cp39-cp39-manylinux_2_28_aarch64
|
|
||||||
|
|
||||||
@ -1,21 +0,0 @@
|
|||||||
MIT License
|
|
||||||
|
|
||||||
Copyright (c) 2025 TAHRI Ahmed R.
|
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
|
||||||
in the Software without restriction, including without limitation the rights
|
|
||||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
copies of the Software, and to permit persons to whom the Software is
|
|
||||||
furnished to do so, subject to the following conditions:
|
|
||||||
|
|
||||||
The above copyright notice and this permission notice shall be included in all
|
|
||||||
copies or substantial portions of the Software.
|
|
||||||
|
|
||||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
SOFTWARE.
|
|
||||||
@ -1,6 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from .cli import cli_detect
|
|
||||||
|
|
||||||
# Only run the command-line detector when this module is executed as a script,
# not when it is merely imported.
if __name__ == "__main__":
    cli_detect()
|
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,669 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from os import PathLike
|
|
||||||
from typing import BinaryIO
|
|
||||||
|
|
||||||
from .cd import (
|
|
||||||
coherence_ratio,
|
|
||||||
encoding_languages,
|
|
||||||
mb_encoding_languages,
|
|
||||||
merge_coherence_ratios,
|
|
||||||
)
|
|
||||||
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
|
|
||||||
from .md import mess_ratio
|
|
||||||
from .models import CharsetMatch, CharsetMatches
|
|
||||||
from .utils import (
|
|
||||||
any_specified_encoding,
|
|
||||||
cut_sequence_chunks,
|
|
||||||
iana_name,
|
|
||||||
identify_sig_or_bom,
|
|
||||||
is_cp_similar,
|
|
||||||
is_multi_byte_encoding,
|
|
||||||
should_strip_sig_or_bom,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Package-wide logger; every diagnostic message of the detection code goes through it.
logger = logging.getLogger("charset_normalizer")
# Stream handler that from_bytes() attaches to `logger` only while it runs with
# explain=True, and detaches again before returning.
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
    logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)
|
|
||||||
|
|
||||||
|
|
||||||
def from_bytes(
    sequences: bytes | bytearray,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.2,
    cp_isolation: list[str] | None = None,
    cp_exclusion: list[str] | None = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Given a raw bytes sequence, return the best possible charsets usable to render str objects.
    If there are no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritizes a particular code page
    but never takes it for granted. Can improve the performance.

    You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
    purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
    toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
    Custom logging format and handler can be set manually.

    The temporary 'explain' logger configuration is undone on every exit path, including when the
    detection raises.

    :param sequences: the payload to analyse.
    :param steps: number of chunks sampled from the payload.
    :param chunk_size: size, in bytes, of each sampled chunk.
    :param threshold: mess ratio at or above which a code page is given up.
    :param cp_isolation: when non-empty, only these code pages are tested.
    :param cp_exclusion: code pages that must not be tested.
    :param preemptive_behaviour: look for an encoding declared inside the payload and test it first.
    :param explain: temporarily attach the debugging StreamHandler and lower the logger level to TRACE.
    :param language_threshold: threshold forwarded to the coherence (language) detection.
    :param enable_fallback: allow ascii / utf_8 / declared-encoding fallback matches when nothing else fits.
    :return: the matches found, possibly empty.
    :raises TypeError: if sequences is neither bytes nor bytearray.
    """

    if not isinstance(sequences, (bytearray, bytes)):
        raise TypeError(
            "Expected object of type bytes or bytearray, got: {}".format(
                type(sequences)
            )
        )

    if explain:
        previous_logger_level: int = logger.level
        logger.addHandler(explain_handler)
        logger.setLevel(TRACE)

    length: int = len(sequences)

    if length == 0:
        logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
        if explain:  # Defensive: ensure exit path clean handler
            logger.removeHandler(explain_handler)
            logger.setLevel(previous_logger_level or logging.WARNING)
        return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])

    try:
        return _from_bytes_detect(
            sequences,
            length,
            steps,
            chunk_size,
            threshold,
            cp_isolation,
            cp_exclusion,
            preemptive_behaviour,
            explain,
            language_threshold,
            enable_fallback,
        )
    finally:
        # Single clean-up point: runs for every return of the detection AND when
        # it raises, so the debugging handler can never stay attached by accident.
        if explain:
            logger.removeHandler(explain_handler)
            logger.setLevel(previous_logger_level)


def _from_bytes_detect(
    sequences: bytes | bytearray,
    length: int,
    steps: int,
    chunk_size: int,
    threshold: float,
    cp_isolation: list[str] | None,
    cp_exclusion: list[str] | None,
    preemptive_behaviour: bool,
    explain: bool,
    language_threshold: float,
    enable_fallback: bool,
) -> CharsetMatches:
    """
    Detection work of from_bytes() for a non-empty payload of `length` bytes.
    Logger set-up and tear-down for 'explain' are the caller's responsibility.
    """
    if cp_isolation is not None:
        logger.log(
            TRACE,
            "cp_isolation is set. use this flag for debugging purpose. "
            "limited list of encoding allowed : %s.",
            ", ".join(cp_isolation),
        )
        cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
    else:
        cp_isolation = []

    if cp_exclusion is not None:
        logger.log(
            TRACE,
            "cp_exclusion is set. use this flag for debugging purpose. "
            "limited list of encoding excluded : %s.",
            ", ".join(cp_exclusion),
        )
        cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
    else:
        cp_exclusion = []

    # Payload smaller than the requested sampling window: analyse it as one chunk.
    if length <= (chunk_size * steps):
        logger.log(
            TRACE,
            "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
            steps,
            chunk_size,
            length,
        )
        steps = 1
        chunk_size = length

    if steps > 1 and length / steps < chunk_size:
        chunk_size = int(length / steps)

    is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
    is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE

    if is_too_small_sequence:
        logger.log(
            TRACE,
            "Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
                length
            ),
        )
    elif is_too_large_sequence:
        logger.log(
            TRACE,
            "Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
                length
            ),
        )

    prioritized_encodings: list[str] = []

    specified_encoding: str | None = (
        any_specified_encoding(sequences) if preemptive_behaviour else None
    )

    if specified_encoding is not None:
        prioritized_encodings.append(specified_encoding)
        logger.log(
            TRACE,
            "Detected declarative mark in sequence. Priority +1 given for %s.",
            specified_encoding,
        )

    tested: set[str] = set()
    tested_but_hard_failure: list[str] = []
    tested_but_soft_failure: list[str] = []

    fallback_ascii: CharsetMatch | None = None
    fallback_u8: CharsetMatch | None = None
    fallback_specified: CharsetMatch | None = None

    results: CharsetMatches = CharsetMatches()

    early_stop_results: CharsetMatches = CharsetMatches()

    sig_encoding, sig_payload = identify_sig_or_bom(sequences)

    if sig_encoding is not None:
        prioritized_encodings.append(sig_encoding)
        logger.log(
            TRACE,
            "Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
            len(sig_payload),
            sig_encoding,
        )

    prioritized_encodings.append("ascii")

    if "utf_8" not in prioritized_encodings:
        prioritized_encodings.append("utf_8")

    # Prioritized candidates first, then every supported code page; `tested`
    # prevents a code page from being evaluated twice.
    for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
        if cp_isolation and encoding_iana not in cp_isolation:
            continue

        if cp_exclusion and encoding_iana in cp_exclusion:
            continue

        if encoding_iana in tested:
            continue

        tested.add(encoding_iana)

        decoded_payload: str | None = None
        bom_or_sig_available: bool = sig_encoding == encoding_iana
        strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
            encoding_iana
        )

        if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
            logger.log(
                TRACE,
                "Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
                encoding_iana,
            )
            continue
        if encoding_iana in {"utf_7"} and not bom_or_sig_available:
            logger.log(
                TRACE,
                "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
                encoding_iana,
            )
            continue

        try:
            is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
        except (ModuleNotFoundError, ImportError):
            logger.log(
                TRACE,
                "Encoding %s does not provide an IncrementalDecoder",
                encoding_iana,
            )
            continue

        # First gate: the payload (or, for large single-byte payloads, only its
        # head) must decode strictly with this code page.
        try:
            if is_too_large_sequence and is_multi_byte_decoder is False:
                str(
                    (
                        sequences[: int(50e4)]
                        if strip_sig_or_bom is False
                        else sequences[len(sig_payload) : int(50e4)]
                    ),
                    encoding=encoding_iana,
                )
            else:
                decoded_payload = str(
                    (
                        sequences
                        if strip_sig_or_bom is False
                        else sequences[len(sig_payload) :]
                    ),
                    encoding=encoding_iana,
                )
        except (UnicodeDecodeError, LookupError) as e:
            if not isinstance(e, LookupError):
                logger.log(
                    TRACE,
                    "Code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
            tested_but_hard_failure.append(encoding_iana)
            continue

        similar_soft_failure_test: bool = False

        for encoding_soft_failed in tested_but_soft_failure:
            if is_cp_similar(encoding_iana, encoding_soft_failed):
                similar_soft_failure_test = True
                break

        if similar_soft_failure_test:
            logger.log(
                TRACE,
                "%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
                encoding_iana,
                encoding_soft_failed,
            )
            continue

        # Offsets at which chunks are sampled (starting after the SIG/BOM, if any).
        r_ = range(
            0 if not bom_or_sig_available else len(sig_payload),
            length,
            int(length / steps),
        )

        multi_byte_bonus: bool = (
            is_multi_byte_decoder
            and decoded_payload is not None
            and len(decoded_payload) < length
        )

        if multi_byte_bonus:
            logger.log(
                TRACE,
                "Code page %s is a multi byte encoding table and it appear that at least one character "
                "was encoded using n-bytes.",
                encoding_iana,
            )

        max_chunk_gave_up: int = int(len(r_) / 4)

        max_chunk_gave_up = max(max_chunk_gave_up, 2)
        early_stop_count: int = 0
        lazy_str_hard_failure = False

        md_chunks: list[str] = []
        md_ratios = []

        try:
            for chunk in cut_sequence_chunks(
                sequences,
                encoding_iana,
                r_,
                chunk_size,
                bom_or_sig_available,
                strip_sig_or_bom,
                sig_payload,
                is_multi_byte_decoder,
                decoded_payload,
            ):
                md_chunks.append(chunk)

                md_ratios.append(
                    mess_ratio(
                        chunk,
                        threshold,
                        explain is True and 1 <= len(cp_isolation) <= 2,
                    )
                )

                if md_ratios[-1] >= threshold:
                    early_stop_count += 1

                if (early_stop_count >= max_chunk_gave_up) or (
                    bom_or_sig_available and strip_sig_or_bom is False
                ):
                    break
        except (
            UnicodeDecodeError
        ) as e:  # Lazy str loading may have missed something there
            logger.log(
                TRACE,
                "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
                encoding_iana,
                str(e),
            )
            early_stop_count = max_chunk_gave_up
            lazy_str_hard_failure = True

        # We might want to check the sequence again with the whole content
        # Only if initial MD tests passes
        if (
            not lazy_str_hard_failure
            and is_too_large_sequence
            and not is_multi_byte_decoder
        ):
            try:
                # NOTE(review): the head probe above stops at int(50e4) while this
                # tail check starts at int(50e3); the two ranges overlap. Behaviour
                # kept as-is -- confirm whether the different bounds are intended.
                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
            except UnicodeDecodeError as e:
                logger.log(
                    TRACE,
                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
                tested_but_hard_failure.append(encoding_iana)
                continue

        mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
        if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
            tested_but_soft_failure.append(encoding_iana)
            logger.log(
                TRACE,
                "%s was excluded because of initial chaos probing. Gave up %i time(s). "
                "Computed mean chaos is %f %%.",
                encoding_iana,
                early_stop_count,
                round(mean_mess_ratio * 100, ndigits=3),
            )
            # Preparing those fallbacks in case we got nothing.
            if (
                enable_fallback
                and encoding_iana
                in ["ascii", "utf_8", specified_encoding, "utf_16", "utf_32"]
                and not lazy_str_hard_failure
            ):
                fallback_entry = CharsetMatch(
                    sequences,
                    encoding_iana,
                    threshold,
                    bom_or_sig_available,
                    [],
                    decoded_payload,
                    preemptive_declaration=specified_encoding,
                )
                if encoding_iana == specified_encoding:
                    fallback_specified = fallback_entry
                elif encoding_iana == "ascii":
                    fallback_ascii = fallback_entry
                else:
                    fallback_u8 = fallback_entry
            continue

        logger.log(
            TRACE,
            "%s passed initial chaos probing. Mean measured chaos is %f %%",
            encoding_iana,
            round(mean_mess_ratio * 100, ndigits=3),
        )

        if not is_multi_byte_decoder:
            target_languages: list[str] = encoding_languages(encoding_iana)
        else:
            target_languages = mb_encoding_languages(encoding_iana)

        if target_languages:
            logger.log(
                TRACE,
                "{} should target any language(s) of {}".format(
                    encoding_iana, str(target_languages)
                ),
            )

        cd_ratios = []

        # We shall skip the CD when its about ASCII
        # Most of the time its not relevant to run "language-detection" on it.
        if encoding_iana != "ascii":
            for chunk in md_chunks:
                chunk_languages = coherence_ratio(
                    chunk,
                    language_threshold,
                    ",".join(target_languages) if target_languages else None,
                )

                cd_ratios.append(chunk_languages)

        cd_ratios_merged = merge_coherence_ratios(cd_ratios)

        if cd_ratios_merged:
            logger.log(
                TRACE,
                "We detected language {} using {}".format(
                    cd_ratios_merged, encoding_iana
                ),
            )

        current_match = CharsetMatch(
            sequences,
            encoding_iana,
            mean_mess_ratio,
            bom_or_sig_available,
            cd_ratios_merged,
            (
                decoded_payload
                if (
                    is_too_large_sequence is False
                    or encoding_iana in [specified_encoding, "ascii", "utf_8"]
                )
                else None
            ),
            preemptive_declaration=specified_encoding,
        )

        results.append(current_match)

        if (
            encoding_iana in [specified_encoding, "ascii", "utf_8"]
            and mean_mess_ratio < 0.1
        ):
            # If md says nothing to worry about, then... stop immediately!
            if mean_mess_ratio == 0.0:
                logger.debug(
                    "Encoding detection: %s is most likely the one.",
                    current_match.encoding,
                )
                return CharsetMatches([current_match])

            early_stop_results.append(current_match)

        if (
            len(early_stop_results)
            and (specified_encoding is None or specified_encoding in tested)
            and "ascii" in tested
            and "utf_8" in tested
        ):
            probable_result: CharsetMatch = early_stop_results.best()  # type: ignore[assignment]
            logger.debug(
                "Encoding detection: %s is most likely the one.",
                probable_result.encoding,
            )

            return CharsetMatches([probable_result])

        if encoding_iana == sig_encoding:
            logger.debug(
                "Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
                "the beginning of the sequence.",
                encoding_iana,
            )
            return CharsetMatches([results[encoding_iana]])

    if len(results) == 0:
        if fallback_u8 or fallback_ascii or fallback_specified:
            logger.log(
                TRACE,
                "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
            )

        if fallback_specified:
            logger.debug(
                "Encoding detection: %s will be used as a fallback match",
                fallback_specified.encoding,
            )
            results.append(fallback_specified)
        elif fallback_u8 is not None:
            # The former three-clause condition was true exactly when a utf_8
            # fallback existed; the fingerprint comparison never changed the outcome.
            logger.debug("Encoding detection: utf_8 will be used as a fallback match")
            results.append(fallback_u8)
        elif fallback_ascii:
            logger.debug("Encoding detection: ascii will be used as a fallback match")
            results.append(fallback_ascii)

    if results:
        logger.debug(
            "Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
            results.best().encoding,  # type: ignore
            len(results) - 1,
        )
    else:
        logger.debug("Encoding detection: Unable to determine any suitable charset.")

    return results
|
|
||||||
|
|
||||||
|
|
||||||
def from_fp(
    fp: BinaryIO,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: list[str] | None = None,
    cp_exclusion: list[str] | None = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Run from_bytes on everything that can be read from an already opened binary file object.
    Every other argument is forwarded unchanged. The file object is left open for the caller.
    """
    payload = fp.read()

    return from_bytes(
        payload,
        steps=steps,
        chunk_size=chunk_size,
        threshold=threshold,
        cp_isolation=cp_isolation,
        cp_exclusion=cp_exclusion,
        preemptive_behaviour=preemptive_behaviour,
        explain=explain,
        language_threshold=language_threshold,
        enable_fallback=enable_fallback,
    )
|
|
||||||
|
|
||||||
|
|
||||||
def from_path(
    path: str | bytes | PathLike,  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: list[str] | None = None,
    cp_exclusion: list[str] | None = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Open the file at the given path in binary mode and hand it to from_fp with the same settings.
    Whatever error open() raises for the path (IOError) is propagated to the caller.
    """
    with open(path, "rb") as binary_file:
        matches = from_fp(
            binary_file,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
            language_threshold=language_threshold,
            enable_fallback=enable_fallback,
        )

    return matches
|
|
||||||
|
|
||||||
|
|
||||||
def is_binary(
    fp_or_path_or_payload: PathLike | str | BinaryIO | bytes,  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: list[str] | None = None,
    cp_exclusion: list[str] | None = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = False,
) -> bool:
    """
    Tell whether the given input (path, raw payload, or opened file) looks binary, i.e. is not text.
    It relies on the same detection routines and keyword arguments as the from_* functions, except that
    fallback matches are off by default, which is stricter for ASCII-compatible content that is
    unlikely to be a string.
    """
    # Same settings whichever entry point ends up being used.
    detection_options = {
        "steps": steps,
        "chunk_size": chunk_size,
        "threshold": threshold,
        "cp_isolation": cp_isolation,
        "cp_exclusion": cp_exclusion,
        "preemptive_behaviour": preemptive_behaviour,
        "explain": explain,
        "language_threshold": language_threshold,
        "enable_fallback": enable_fallback,
    }

    # str / PathLike -> path on disk; bytes / bytearray -> in-memory payload;
    # anything else is treated as a readable binary file object.
    if isinstance(fp_or_path_or_payload, (str, PathLike)):
        detector = from_path
    elif isinstance(fp_or_path_or_payload, (bytes, bytearray)):
        detector = from_bytes
    else:
        detector = from_fp

    guesses = detector(fp_or_path_or_payload, **detection_options)

    # No plausible charset at all means the content is considered binary.
    return not guesses
|
|
||||||
@ -1,395 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import importlib
|
|
||||||
from codecs import IncrementalDecoder
|
|
||||||
from collections import Counter
|
|
||||||
from functools import lru_cache
|
|
||||||
from typing import Counter as TypeCounter
|
|
||||||
|
|
||||||
from .constant import (
|
|
||||||
FREQUENCIES,
|
|
||||||
KO_NAMES,
|
|
||||||
LANGUAGE_SUPPORTED_COUNT,
|
|
||||||
TOO_SMALL_SEQUENCE,
|
|
||||||
ZH_NAMES,
|
|
||||||
)
|
|
||||||
from .md import is_suspiciously_successive_range
|
|
||||||
from .models import CoherenceMatches
|
|
||||||
from .utils import (
|
|
||||||
is_accentuated,
|
|
||||||
is_latin,
|
|
||||||
is_multi_byte_encoding,
|
|
||||||
is_unicode_range_secondary,
|
|
||||||
unicode_range,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def encoding_unicode_range(iana_name: str) -> list[str]:
    """
    List, sorted, the unicode ranges that a single-byte code page maps its bytes to.
    Only ranges holding at least 15% of the counted characters are kept.
    Raises OSError when given a multi-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise OSError("Function not supported on multi-byte code page")

    codec_module = importlib.import_module(f"encodings.{iana_name}")
    p: IncrementalDecoder = codec_module.IncrementalDecoder(errors="ignore")

    hits_per_range: dict[str, int] = {}
    counted_characters: int = 0

    # Decode the bytes 0x40..0xFE one by one and tally the range of each result.
    for byte_value in range(0x40, 0xFF):
        decoded: str = p.decode(bytes([byte_value]))

        if not decoded:
            continue

        detected_range: str | None = unicode_range(decoded)

        if detected_range is None:
            continue

        # Secondary ranges are neither tallied nor counted in the total.
        if is_unicode_range_secondary(detected_range) is False:
            hits_per_range[detected_range] = hits_per_range.get(detected_range, 0) + 1
            counted_characters += 1

    return sorted(
        range_name
        for range_name, hits in hits_per_range.items()
        if hits / counted_characters >= 0.15
    )
|
|
||||||
|
|
||||||
|
|
||||||
def unicode_range_languages(primary_range: str) -> list[str]:
    """
    Collect every language of FREQUENCIES having at least one character in the given unicode range.
    """
    return [
        language
        for language, characters in FREQUENCIES.items()
        if any(unicode_range(character) == primary_range for character in characters)
    ]
|
|
||||||
|
|
||||||
|
|
||||||
@lru_cache()
def encoding_languages(iana_name: str) -> list[str]:
    """
    Language association for a single-byte encoding: some code pages are tightly bound to given language(s).
    The first non-Latin unicode range of the code page decides; without one the answer is "Latin Based".
    """
    non_latin_ranges = (
        candidate
        for candidate in encoding_unicode_range(iana_name)
        if "Latin" not in candidate
    )
    primary_range: str | None = next(non_latin_ranges, None)

    if primary_range is not None:
        return unicode_range_languages(primary_range)

    return ["Latin Based"]
|
|
||||||
|
|
||||||
|
|
||||||
@lru_cache()
def mb_encoding_languages(iana_name: str) -> list[str]:
    """
    Language association for a multi-byte encoding: some code pages are tightly bound to given language(s).
    Returns a one-element list for Japanese, Chinese or Korean code pages, and an empty list otherwise.
    """
    japanese_prefixes = ("shift_", "iso2022_jp", "euc_j")

    if iana_name.startswith(japanese_prefixes) or iana_name == "cp932":
        return ["Japanese"]

    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]

    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
|
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> tuple[bool, bool]:
    """
    Describe two traits of a supported language from its frequency table.

    :param language: Key of FREQUENCIES (an unknown key raises KeyError).
    :return: (has_accents, is_pure_latin) where has_accents is True when at
        least one character is accentuated and is_pure_latin is True when no
        character is reported as non-Latin.
    """
    alphabet = FREQUENCIES[language]

    has_accents: bool = any(is_accentuated(letter) for letter in alphabet)
    is_pure_latin: bool = not any(is_latin(letter) is False for letter in alphabet)

    return has_accents, is_pure_latin
|
|
||||||
|
|
||||||
|
|
||||||
def alphabet_languages(
    characters: list[str], ignore_non_latin: bool = False
) -> list[str]:
    """
    Return the languages that plausibly match the given characters.

    A language is kept when at least 20% of the characters of its frequency
    table are present in ``characters``.

    :param characters: Characters observed in the source text.
    :param ignore_non_latin: When True, skip languages that are not pure Latin.
    :return: Language names sorted from the highest to the lowest match ratio.
    """
    languages: list[tuple[str, float]] = []

    # Hash-based membership: the table of every language is probed against the
    # same observed characters, so avoid a linear list scan per probe.
    observed_characters = frozenset(characters)

    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        # A source with accents cannot come from a language that has none.
        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = sum(
            1 for c in language_characters if c in observed_characters
        )

        ratio: float = character_match_count / character_count

        if ratio >= 0.2:
            languages.append((language, ratio))

    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
|
|
||||||
|
|
||||||
|
|
||||||
def characters_popularity_compare(
    language: str, ordered_characters: list[str]
) -> float:
    """
    Determine how well an ordered character list (most frequent first) matches
    the character popularity ranking of a particular language.

    The result is a ratio between 0. (no correspondence) and 1. (near perfect fit).
    The comparison is deliberately lenient in order to ease detection, meaning a
    close match may still score 1.

    :param language: Key of FREQUENCIES.
    :param ordered_characters: Characters sorted by decreasing occurrence.
    :return: Share of ``ordered_characters`` whose rank is judged compatible;
        0.0 when ``ordered_characters`` is empty.
    :raises ValueError: When ``language`` is not a key of FREQUENCIES.
    """
    if language not in FREQUENCIES:
        raise ValueError(f"{language} not available")

    ordered_characters_count: int = len(ordered_characters)

    # Nothing to compare: avoid dividing by zero below.
    if ordered_characters_count == 0:
        return 0.0

    character_approved_count: int = 0

    language_characters = FREQUENCIES[language]
    target_language_characters_count: int = len(language_characters)

    # Rank lookup table; setdefault keeps the first position of a character,
    # exactly like list.index() would return.
    rank_in_language: dict[str, int] = {}
    for rank, language_character in enumerate(language_characters):
        rank_in_language.setdefault(language_character, rank)

    large_alphabet: bool = target_language_characters_count > 26

    # Scale factor used to project a source rank onto the language ranking.
    expected_projection_ratio: float = (
        target_language_characters_count / ordered_characters_count
    )

    for character_rank, character in enumerate(ordered_characters):
        if character not in rank_in_language:
            continue

        character_rank_in_language: int = rank_in_language[character]
        character_rank_projection: int = int(character_rank * expected_projection_ratio)
        rank_distance: int = abs(character_rank_projection - character_rank_in_language)

        # Small alphabets: reject characters that are too far from their expected rank.
        if large_alphabet is False and rank_distance > 4:
            continue

        # Large alphabets: accept as soon as the rank is within a third of the table.
        if large_alphabet is True and rank_distance < target_language_characters_count / 3:
            character_approved_count += 1
            continue

        characters_before_source: list[str] = language_characters[
            0:character_rank_in_language
        ]
        characters_after_source: list[str] = language_characters[
            character_rank_in_language:
        ]
        characters_before: list[str] = ordered_characters[0:character_rank]
        characters_after: list[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        # Accept when enough of the neighbours on either side are shared.
        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / ordered_characters_count
|
|
||||||
|
|
||||||
|
|
||||||
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
    """
    Split a decoded text into per-alphabet layers.

    Only alphabetic characters with a known unicode range are kept, lowercased.
    A character joins the first existing layer whose range is not flagged by
    is_suspiciously_successive_range(); otherwise it opens a layer for its own range.
    Ex. a text mixing English/Latin with a bit of Hebrew yields two items: one with
    the Latin letters and the other with the Hebrew ones.

    :param decoded_sequence: Text to split.
    :return: One string per layer, in order of layer creation.
    """
    chunks_by_range: dict[str, str] = {}

    for symbol in decoded_sequence:
        if not symbol.isalpha():
            continue

        symbol_range: str | None = unicode_range(symbol)

        if symbol_range is None:
            continue

        # First compatible layer wins; fall back to the character's own range.
        target_range: str = next(
            (
                known_range
                for known_range in chunks_by_range
                if is_suspiciously_successive_range(known_range, symbol_range) is False
            ),
            symbol_range,
        )

        chunks_by_range[target_range] = (
            chunks_by_range.get(target_range, "") + symbol.lower()
        )

    return list(chunks_by_range.values())
|
|
||||||
|
|
||||||
|
|
||||||
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge several coherence results into one.

    Ratios reported for the same language are averaged and rounded to 4 digits.

    :param results: List of (language, ratio) lists.
    :return: One (language, averaged ratio) list sorted by decreasing ratio.
    """
    collected: dict[str, list[float]] = {}

    for matches in results:
        for language, ratio in matches:
            collected.setdefault(language, []).append(ratio)

    averaged = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in collected.items()
    ]
    averaged.sort(key=lambda entry: entry[1], reverse=True)

    return averaged
|
|
||||||
|
|
||||||
|
|
||||||
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". This function keeps only the best ratio per language and removes
    the em-dash from the language name.

    The em-dash is stripped even when only the alternative matched, so an
    alternative name can never leak to the caller.

    :param results: (language, ratio) pairs, possibly holding alternative names.
    :return: New list with one (language, best ratio) pair per canonical language,
        in order of first appearance.
    """
    best_by_language: dict[str, float] = {}

    for language, ratio in results:
        no_em_name: str = language.replace("—", "")

        if no_em_name not in best_by_language or ratio > best_by_language[no_em_name]:
            best_by_language[no_em_name] = ratio

    return list(best_by_language.items())
|
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence.

    The sequence is analysed layer by layer, a layer being the characters
    extracted for one alphabet/range by alpha_unicode_split().

    :param decoded_sequence: Text to analyse.
    :param threshold: Minimum ratio for a language to be reported.
    :param lg_inclusion: Optional comma separated list of languages to test.
        The special entry "Latin Based" restricts automatic candidates to
        pure Latin languages instead of naming a language.
    :return: (language, ratio) pairs sorted by decreasing ratio.
    """
    matches: list[tuple[str, float]] = []
    strong_match_total: int = 0

    forced_languages = [] if lg_inclusion is None else lg_inclusion.split(",")
    latin_only: bool = "Latin Based" in forced_languages

    if latin_only:
        forced_languages.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        ranked_letters = Counter(layer).most_common()

        letter_total: int = sum(occurrences for _, occurrences in ranked_letters)

        # Too little material in this layer to draw any conclusion.
        if letter_total <= TOO_SMALL_SEQUENCE:
            continue

        letters_by_popularity: list[str] = [letter for letter, _ in ranked_letters]

        candidates = forced_languages or alphabet_languages(
            letters_by_popularity, latin_only
        )

        for language in candidates:
            score: float = characters_popularity_compare(
                language, letters_by_popularity
            )

            if score < threshold:
                continue

            if score >= 0.8:
                strong_match_total += 1

            matches.append((language, round(score, 4)))

            # Three strong matches are enough for this layer.
            if strong_match_total >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(matches), key=lambda x: x[1], reverse=True
    )
|
|
||||||
@ -1,8 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from .__main__ import cli_detect, query_yes_no
|
|
||||||
|
|
||||||
__all__ = (
|
|
||||||
"cli_detect",
|
|
||||||
"query_yes_no",
|
|
||||||
)
|
|
||||||
@ -1,381 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import sys
|
|
||||||
import typing
|
|
||||||
from json import dumps
|
|
||||||
from os.path import abspath, basename, dirname, join, realpath
|
|
||||||
from platform import python_version
|
|
||||||
from unicodedata import unidata_version
|
|
||||||
|
|
||||||
import charset_normalizer.md as md_module
|
|
||||||
from charset_normalizer import from_fp
|
|
||||||
from charset_normalizer.models import CliDetectionResult
|
|
||||||
from charset_normalizer.version import __version__
|
|
||||||
|
|
||||||
|
|
||||||
def query_yes_no(question: str, default: str | None = "yes") -> bool:
    """Ask a yes/no question via input() and return the answer.

    "question" is a string that is presented to the user.
    "default" is the presumed answer if the user just hits <Enter>.
            It must be "yes" (the default), "no" or None (meaning
            an answer is required of the user).

    The "answer" return value is True for "yes" or False for "no".
    Surrounding whitespace in the typed answer is ignored. The question is
    asked again until a valid answer is given.

    :raises ValueError: When "default" is not "yes", "no" or None.

    Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
    """
    valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
    if default is None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    while True:
        sys.stdout.write(question + prompt)
        # Tolerate stray spaces around the answer (e.g. "y " or " no").
        choice = input().strip().lower()
        if default is not None and choice == "":
            return valid[default]
        elif choice in valid:
            return valid[choice]
        else:
            sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
|
|
||||||
|
|
||||||
|
|
||||||
class FileType:
    """Callable that turns a command line string into an open file object.

    Instances are meant to be given as the type= argument of
    ArgumentParser.add_argument().

    Keyword Arguments:
        - mode -- How the file is opened; same values as the builtin open().
        - bufsize -- Desired buffer size; same values as the builtin open().
        - encoding -- File encoding; same values as the builtin open().
        - errors -- How encoding and decoding errors are handled; same values
            as the builtin open().

    Backported from CPython 3.12
    """

    def __init__(
        self,
        mode: str = "r",
        bufsize: int = -1,
        encoding: str | None = None,
        errors: str | None = None,
    ):
        self._mode = mode
        self._bufsize = bufsize
        self._encoding = encoding
        self._errors = errors

    def __call__(self, string: str) -> typing.IO:  # type: ignore[type-arg]
        """Open ``string`` as a file, "-" standing for the standard streams."""
        if string == "-":
            wants_binary = "b" in self._mode

            if "r" in self._mode:
                return sys.stdin.buffer if wants_binary else sys.stdin

            if any(c in self._mode for c in "wax"):
                return sys.stdout.buffer if wants_binary else sys.stdout

            msg = f'argument "-" with mode {self._mode}'
            raise ValueError(msg)

        # Anything else is a file name.
        try:
            return open(string, self._mode, self._bufsize, self._encoding, self._errors)
        except OSError as e:
            message = f"can't open '{string}': {e}"
            raise argparse.ArgumentTypeError(message)

    def __repr__(self) -> str:
        # The default bufsize (-1) and unset keyword values are left out.
        positional = [repr(value) for value in (self._mode, self._bufsize) if value != -1]
        keyword = [
            f"{label}={value!r}"
            for label, value in (("encoding", self._encoding), ("errors", self._errors))
            if value is not None
        ]
        args_str = ", ".join(positional + keyword)
        return f"{type(self).__name__}({args_str})"
|
|
||||||
|
|
||||||
|
|
||||||
def cli_detect(argv: list[str] | None = None) -> int:
    """
    CLI assistant using ARGV and ArgumentParser

    Detects the encoding of every given file, optionally writes a normalized
    copy (or replaces the file), then prints the results as JSON, or as bare
    encoding names with --minimal.

    :param argv: Arguments to parse; None lets argparse read sys.argv.
    :return: 0 if everything is fine, 1 on invalid option combination or
        threshold, 2 when the normalized file could not be written.
    """
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode."
    )

    parser.add_argument(
        "files", type=FileType("rb"), nargs="+", help="File(s) to be analysed"
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        dest="verbose",
        help="Display complementary information about file if any. "
        "Stdout will contain logs about the detection process.",
    )
    parser.add_argument(
        "-a",
        "--with-alternative",
        action="store_true",
        default=False,
        dest="alternatives",
        help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        action="store_true",
        default=False,
        dest="normalize",
        help="Permit to normalize input file. If not set, program does not write anything.",
    )
    parser.add_argument(
        "-m",
        "--minimal",
        action="store_true",
        default=False,
        dest="minimal",
        help="Only output the charset detected to STDOUT. Disabling JSON output.",
    )
    parser.add_argument(
        "-r",
        "--replace",
        action="store_true",
        default=False,
        dest="replace",
        help="Replace file when trying to normalize it instead of creating a new one.",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        dest="force",
        help="Replace file without asking if you are sure, use this flag with caution.",
    )
    parser.add_argument(
        "-i",
        "--no-preemptive",
        action="store_true",
        default=False,
        dest="no_preemptive",
        help="Disable looking at a charset declaration to hint the detector.",
    )
    parser.add_argument(
        "-t",
        "--threshold",
        action="store",
        default=0.2,
        type=float,
        dest="threshold",
        help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
            __version__,
            python_version(),
            unidata_version,
            "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
        ),
        help="Show version information and exit.",
    )

    args = parser.parse_args(argv)

    def _reject(reason: str) -> int:
        """Close every opened input file, report the usage error, return 1."""
        if args.files:
            for opened_file in args.files:
                opened_file.close()
        print(reason, file=sys.stderr)
        return 1

    if args.replace is True and args.normalize is False:
        return _reject("Use --replace in addition of --normalize only.")

    if args.force is True and args.replace is False:
        return _reject("Use --force in addition of --replace only.")

    if args.threshold < 0.0 or args.threshold > 1.0:
        return _reject("--threshold VALUE should be between 0. AND 1.")

    x_ = []

    for my_file in args.files:
        matches = from_fp(
            my_file,
            threshold=args.threshold,
            explain=args.verbose,
            preemptive_behaviour=args.no_preemptive is False,
        )

        best_guess = matches.best()

        if best_guess is None:
            print(
                'Unable to identify originating encoding for "{}". {}'.format(
                    my_file.name,
                    (
                        "Maybe try increasing maximum amount of chaos."
                        if args.threshold < 1.0
                        else ""
                    ),
                ),
                file=sys.stderr,
            )
            x_.append(
                CliDetectionResult(
                    abspath(my_file.name),
                    None,
                    [],
                    [],
                    "Unknown",
                    [],
                    False,
                    1.0,
                    0.0,
                    None,
                    True,
                )
            )
        else:
            # Keep a handle on the primary result of THIS file: the normalized
            # path must be attached to it, not to the first entry of x_.
            primary_result = CliDetectionResult(
                abspath(my_file.name),
                best_guess.encoding,
                best_guess.encoding_aliases,
                [
                    cp
                    for cp in best_guess.could_be_from_charset
                    if cp != best_guess.encoding
                ],
                best_guess.language,
                best_guess.alphabets,
                best_guess.bom,
                best_guess.percent_chaos,
                best_guess.percent_coherence,
                None,
                True,
            )
            x_.append(primary_result)

            if len(matches) > 1 and args.alternatives:
                for el in matches:
                    if el != best_guess:
                        x_.append(
                            CliDetectionResult(
                                abspath(my_file.name),
                                el.encoding,
                                el.encoding_aliases,
                                [
                                    cp
                                    for cp in el.could_be_from_charset
                                    if cp != el.encoding
                                ],
                                el.language,
                                el.alphabets,
                                el.bom,
                                el.percent_chaos,
                                el.percent_coherence,
                                None,
                                False,
                            )
                        )

            if args.normalize is True:
                if best_guess.encoding.startswith("utf") is True:
                    print(
                        '"{}" file does not need to be normalized, as it already came from unicode.'.format(
                            my_file.name
                        ),
                        file=sys.stderr,
                    )
                    if my_file.closed is False:
                        my_file.close()
                    continue

                dir_path = dirname(realpath(my_file.name))
                file_name = basename(realpath(my_file.name))

                o_: list[str] = file_name.split(".")

                if args.replace is False:
                    # New file: the encoding name is inserted before the last
                    # dot-separated part of the file name.
                    o_.insert(-1, best_guess.encoding)
                    if my_file.closed is False:
                        my_file.close()
                elif (
                    args.force is False
                    and query_yes_no(
                        'Are you sure to normalize "{}" by replacing it ?'.format(
                            my_file.name
                        ),
                        "no",
                    )
                    is False
                ):
                    if my_file.closed is False:
                        my_file.close()
                    continue

                try:
                    primary_result.unicode_path = join(dir_path, ".".join(o_))

                    with open(primary_result.unicode_path, "wb") as fp:
                        fp.write(best_guess.output())
                except OSError as e:
                    print(str(e), file=sys.stderr)
                    if my_file.closed is False:
                        my_file.close()
                    return 2

        if my_file.closed is False:
            my_file.close()

    if args.minimal is False:
        print(
            dumps(
                [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
                ensure_ascii=True,
                indent=4,
            )
        )
    else:
        for my_file in args.files:
            print(
                ", ".join(
                    [
                        el.encoding or "undefined"
                        for el in x_
                        if el.path == abspath(my_file.name)
                    ]
                )
            )

    return 0
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Propagate the status returned by cli_detect() as the process exit code.
    sys.exit(cli_detect())
|
|
||||||
File diff suppressed because it is too large
Load Diff
@ -1,80 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from typing import TYPE_CHECKING, Any
|
|
||||||
from warnings import warn
|
|
||||||
|
|
||||||
from .api import from_bytes
|
|
||||||
from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE
|
|
||||||
|
|
||||||
# TODO: remove this check when dropping Python 3.7 support
if TYPE_CHECKING:
    # Only imported for static type checking; never executed at runtime.
    from typing_extensions import TypedDict

    # Typed shape used as the return annotation of detect() below.
    class ResultDict(TypedDict):
        encoding: str | None
        language: str
        confidence: float | None
|
|
||||||
|
|
||||||
|
|
||||||
def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> ResultDict:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    The encoding name matches chardet's own spelling whenever possible (not for
    encoding names chardet does not support).
    This function is deprecated and exists to ease project migration; consult the
    documentation for further information. Not planned for removal.

    :param byte_str:     The byte sequence to examine.
    :param should_rename_legacy:  Should we rename legacy encodings
                                  to their more modern equivalents?
    :return: Dictionary with the "encoding", "language" and "confidence" keys.
    :raises TypeError: When ``byte_str`` is neither bytes nor bytearray.
    """
    if kwargs:
        warn(
            f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    best_match = from_bytes(byte_str).best()

    if best_match is None:
        encoding = None
        language = ""
        confidence = None
    else:
        encoding = best_match.encoding
        language = best_match.language if best_match.language != "Unknown" else ""
        confidence = 1.0 - best_match.chaos

    # automatically lower confidence
    # on small bytes samples.
    # https://github.com/jawah/charset_normalizer/issues/391
    confident_on_tiny_sample = (
        confidence is not None
        and confidence >= 0.9
        and encoding not in {"utf_8", "ascii"}
        and best_match.bom is False  # type: ignore[union-attr]
        and len(byte_str) < TOO_SMALL_SEQUENCE
    )
    if confident_on_tiny_sample:
        confidence -= 0.2

    # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
    # but chardet does return 'utf-8-sig' and it is a valid codec name.
    if best_match is not None and encoding == "utf_8" and best_match.bom:
        encoding += "_sig"

    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }
|
|
||||||
Binary file not shown.
@ -1,635 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from functools import lru_cache
|
|
||||||
from logging import getLogger
|
|
||||||
|
|
||||||
from .constant import (
|
|
||||||
COMMON_SAFE_ASCII_CHARACTERS,
|
|
||||||
TRACE,
|
|
||||||
UNICODE_SECONDARY_RANGE_KEYWORD,
|
|
||||||
)
|
|
||||||
from .utils import (
|
|
||||||
is_accentuated,
|
|
||||||
is_arabic,
|
|
||||||
is_arabic_isolated_form,
|
|
||||||
is_case_variable,
|
|
||||||
is_cjk,
|
|
||||||
is_emoticon,
|
|
||||||
is_hangul,
|
|
||||||
is_hiragana,
|
|
||||||
is_katakana,
|
|
||||||
is_latin,
|
|
||||||
is_punctuation,
|
|
||||||
is_separator,
|
|
||||||
is_symbol,
|
|
||||||
is_thai,
|
|
||||||
is_unprintable,
|
|
||||||
remove_accent,
|
|
||||||
unicode_range,
|
|
||||||
is_cjk_uncommon,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class MessDetectorPlugin:
    """
    Abstract base for mess detection plugins.
    Every detector MUST subclass it and implement the methods below.
    """

    def eligible(self, character: str) -> bool:
        """
        Tell whether the given character should be passed to feed().
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        Main routine, executed on each eligible character.
        Put here the logic by which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Bring the plugin back to its initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Chaos ratio computed from what feed() has seen so far.
        Must NOT be lower than 0.; there is no upper bound.
        """
        raise NotImplementedError  # pragma: nocover
|
|
||||||
|
|
||||||
|
|
||||||
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Flag text in which punctuation and symbols take too large a share of the
    printable characters.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        self._last_printable_char: str | None = None
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        """Only printable characters are analysed."""
        return character.isprintable()

    def feed(self, character: str) -> None:
        """Count the character, and score it when it is punctuation or a symbol."""
        self._character_count += 1

        # An immediate repetition of the previous character and the common safe
        # ASCII characters are never scored.
        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # Symbols weigh twice as much as punctuation.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # Abstract
        """Restore every attribute to the value set by __init__."""
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0

        # Also forget the previous character, otherwise the first character fed
        # after a reset would be compared with stale state.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """Punctuation/symbol share, reported only from 0.3 upward (else 0.0)."""
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
|
|
||||||
|
|
||||||
|
|
||||||
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Flag text in which accentuated letters take too large a share of the
    alphabetic characters.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        """Only alphabetic characters are analysed."""
        return character.isalpha()

    def feed(self, character: str) -> None:
        """Count the character and, separately, the accentuated ones."""
        if is_accentuated(character):
            self._accentuated_count += 1

        self._character_count += 1

    def reset(self) -> None:  # Abstract
        """Zero both counters."""
        self._accentuated_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Accentuated share, reported only from 0.35 upward and once at
        least 8 characters were fed (else 0.0)."""
        if self._character_count < 8:
            return 0.0

        accentuated_share: float = self._accentuated_count / self._character_count

        if accentuated_share >= 0.35:
            return accentuated_share
        return 0.0
|
|
||||||
|
|
||||||
|
|
||||||
class UnprintablePlugin(MessDetectorPlugin):
    """
    Measure how many of the characters fed are unprintable.

    Every character is eligible; each unprintable one weighs 8 in the ratio.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # Abstract
        self._unprintable_count = 0
        # The total must be cleared too, otherwise the next ratio is diluted
        # by characters counted before the reset.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Return (unprintable count * 8) / characters fed, or 0.0 when nothing was fed."""
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
|
|
||||||
|
|
||||||
|
|
||||||
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Count accentuated Latin letters that directly follow another accentuated
    Latin letter.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: str | None = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1

        previous = self._last_latin_character
        both_accentuated = (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        )

        if both_accentuated:
            if character.isupper() and previous.isupper():
                self._successive_count += 1
            # Worse if it is the same base character repeated with another accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1

        self._last_latin_character = character

    def reset(self) -> None:  # Abstract
        self._successive_count = self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        """Return (successive count * 2) / characters fed, or 0.0 when nothing was fed."""
        total = self._character_count
        if total == 0:
            return 0.0

        return (self._successive_count * 2) / total
|
|
||||||
|
|
||||||
|
|
||||||
class SuspiciousRange(MessDetectorPlugin):
    """
    Count adjacent printable characters whose Unicode ranges are judged
    suspicious by is_suspiciously_successive_range.
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: str | None = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        # Whitespace, punctuation and common safe ASCII break the pair chain.
        breaks_chain = (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        )
        if breaks_chain:
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen
        if previous is None:
            # First character of a new chain: nothing to compare with yet.
            self._last_printable_seen = character
            return

        range_of_previous: str | None = unicode_range(previous)
        range_of_current: str | None = unicode_range(character)

        if is_suspiciously_successive_range(range_of_previous, range_of_current):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # Abstract
        self._character_count = self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        """Return (suspicious pairs * 2) / characters fed; 0.0 for 13 characters or fewer."""
        total = self._character_count
        if total <= 13:
            return 0.0

        return (self._suspicious_successive_range_count * 2) / total
|
|
||||||
|
|
||||||
|
|
||||||
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Accumulate alphabetic characters into a word buffer and, each time a word
    ends, decide whether that word looks bad. The ratio is the share of
    characters that belong to bad words.
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        # State of the word currently being read.
        self._buffer: str = ""
        self._buffer_accent_count: int = 0
        self._buffer_glyph_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            # Still inside a word: extend the buffer and update its statistics.
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            if (
                is_cjk(character)
                or is_hangul(character)
                or is_katakana(character)
                or is_hiragana(character)
                or is_thai(character)
            ):
                self._buffer_glyph_count += 1
            return
        if not self._buffer:
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            # End of word: evaluate the buffer, then start a fresh one.
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length >= 0.5:
                    self._is_current_word_bad = True
                # Word/Buffer ending with an upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                elif (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and all(_.isupper() for _ in self._buffer) is False
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
                elif self._buffer_glyph_count == 1:
                    self._is_current_word_bad = True
                    self._foreign_long_count += 1
            if buffer_length >= 24 and self._foreign_long_watch:
                # Positions of upper-case letters; a low share of them marks
                # the long word as probably camel-cased rather than garbage.
                camel_case_dst = [
                    i
                    for c, i in zip(self._buffer, range(0, buffer_length))
                    if c.isupper()
                ]
                probable_camel_cased: bool = False

                if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
                    probable_camel_cased = True

                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
            self._buffer_glyph_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            # A symbol in the middle of a word taints the whole word.
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # Abstract
        self._buffer = ""
        # The per-buffer statistics belong to the buffer and are cleared with
        # it, so a half-read word cannot leak counts into the next one.
        self._buffer_accent_count = 0
        self._buffer_glyph_count = 0
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        """
        Return bad characters / counted characters.

        Returns 0.0 while at most 10 words were seen and no foreign-long word
        was flagged.
        """
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
|
|
||||||
|
|
||||||
|
|
||||||
class CjkUncommonPlugin(MessDetectorPlugin):
    """
    Detect messy CJK text that probably means nothing.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._uncommon_count: int = 0

    def eligible(self, character: str) -> bool:
        return is_cjk(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if is_cjk_uncommon(character):
            self._uncommon_count += 1

    def reset(self) -> None:  # Abstract
        self._character_count = self._uncommon_count = 0

    @property
    def ratio(self) -> float:
        """Return a tenth of the uncommon share when it exceeds 0.5, else 0.0 (also 0.0 below 8 characters)."""
        total = self._character_count
        if total < 8:
            return 0.0

        uncommon_form_usage: float = self._uncommon_count / total

        # Widespread uncommon characters are very likely garbage; a lower
        # share could just be traditional Chinese, for example.
        if uncommon_form_usage > 0.5:
            return uncommon_form_usage / 10
        return 0.0
|
|
||||||
|
|
||||||
|
|
||||||
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Count repeated upper/lower case alternations inside chunks of
    case-variable alphabetic characters.
    """

    def __init__(self) -> None:
        # True when the previous pair of letters already flipped case once.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: str | None = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)

        if is_concerned is False and self._character_count_since_last_sep > 0:
            # A separator closes the current chunk. Its alternations only
            # count when the chunk is short enough, is not ended by a digit
            # and contained at least one non-ASCII character.
            chunk_is_counted = (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            )
            if chunk_is_counted:
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            # Start over for the next chunk.
            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True
            return

        if self._current_ascii_only is True and character.isascii() is False:
            self._current_ascii_only = False

        previous = self._last_alpha_seen
        if previous is not None:
            case_flipped = (character.isupper() and previous.islower()) or (
                character.islower() and previous.isupper()
            )
            if not case_flipped:
                self._buf = False
            elif self._buf is True:
                # Second flip in a row.
                self._successive_upper_lower_count += 2
                self._buf = False
            else:
                self._buf = True

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        """Return accepted alternation count / characters fed, or 0.0 when nothing was fed."""
        total = self._character_count
        if total == 0:
            return 0.0

        return self._successive_upper_lower_count_final / total
|
|
||||||
|
|
||||||
|
|
||||||
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """
    Measure how many of the Arabic characters fed are isolated forms.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # Abstract
        self._character_count = self._isolated_form_count = 0

    def eligible(self, character: str) -> bool:
        return is_arabic(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if is_arabic_isolated_form(character):
            self._isolated_form_count += 1

    @property
    def ratio(self) -> float:
        """Return isolated forms / Arabic characters fed; 0.0 below 8 characters."""
        total = self._character_count
        if total < 8:
            return 0.0

        return self._isolated_form_count / total
|
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: str | None, unicode_range_b: str | None
) -> bool:
    """
    Tell whether two Unicode range names seen side by side look suspicious.

    An unknown range (None) is always suspicious; identical ranges never are.
    The remaining checks whitelist combinations that are accepted as neighbours.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    both = (unicode_range_a, unicode_range_b)

    latin_in_a = "Latin" in unicode_range_a
    latin_in_b = "Latin" in unicode_range_b
    if latin_in_a and latin_in_b:
        return False

    if any("Emoticons" in name for name in both):
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if (latin_in_a or latin_in_b) and any("Combining" in name for name in both):
        return False

    # Two ranges sharing a primary keyword are considered related.
    words_of_b = unicode_range_b.split(" ")
    for word in unicode_range_a.split(" "):
        if word not in UNICODE_SECONDARY_RANGE_KEYWORD and word in words_of_b:
            return False

    # Japanese Exception
    kana = ("Hiragana", "Katakana")
    a_is_kana = unicode_range_a in kana
    b_is_kana = unicode_range_b in kana
    cjk_involved = "CJK" in unicode_range_a or "CJK" in unicode_range_b
    basic_latin_involved = (
        unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin"
    )

    if (a_is_kana or b_is_kana) and cjk_involved:
        return False
    if a_is_kana and b_is_kana:
        return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if cjk_involved:
            return False
        if basic_latin_involved:
            return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    if cjk_involved or (a_is_kana and b_is_kana):
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False
        if basic_latin_involved:
            return False

    return True
|
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute the mess ratio of a decoded sequence by summing the ratio of every
    MessDetectorPlugin subclass. Computation stops early once the running sum
    reaches maximum_threshold. The result is rounded to 3 digits.
    """
    detectors: list[MessDetectorPlugin] = [
        plugin_class() for plugin_class in MessDetectorPlugin.__subclasses__()
    ]

    # One extra position: a newline is fed after the sequence itself.
    length: int = len(decoded_sequence) + 1
    final_index = length - 1

    # How often (in characters) the intermediate sum is evaluated.
    if length < 512:
        checkpoint_every: int = 32
    elif length <= 1024:
        checkpoint_every = 64
    else:
        checkpoint_every = 128

    mean_mess_ratio: float = 0.0

    for index, character in enumerate(decoded_sequence + "\n"):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        at_checkpoint = index > 0 and index % checkpoint_every == 0
        if at_checkpoint or index == final_index:
            mean_mess_ratio = sum(detector.ratio for detector in detectors)

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={checkpoint_every} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)
|
|
||||||
Binary file not shown.
@ -1,360 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from encodings.aliases import aliases
|
|
||||||
from hashlib import sha256
|
|
||||||
from json import dumps
|
|
||||||
from re import sub
|
|
||||||
from typing import Any, Iterator, List, Tuple
|
|
||||||
|
|
||||||
from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
|
|
||||||
from .utils import iana_name, is_multi_byte_encoding, unicode_range
|
|
||||||
|
|
||||||
|
|
||||||
class CharsetMatch:
    """
    One candidate decoding of a bytes payload: the guessed encoding together
    with its mess ("chaos") ratio, its language coherence results and the
    other encodings that produce the same text (submatches).
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: CoherenceMatches,
        decoded_payload: str | None = None,
        preemptive_declaration: str | None = None,
    ):
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        self._unicode_ranges: list[str] | None = None

        self._leaves: list[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for output(): last re-encoded payload and its target encoding.
        self._output_payload: bytes | None = None
        self._output_encoding: str | None = None

        # Decoded text; filled lazily by __str__ when not supplied.
        self._string: str | None = decoded_payload

        self._preemptive_declaration: str | None = preemptive_declaration

    def __eq__(self, other: object) -> bool:
        """
        Compare with another CharsetMatch (same encoding and fingerprint) or
        with an encoding name given as str.
        """
        if not isinstance(other, CharsetMatch):
            if isinstance(other, str):
                return iana_name(other) == self.encoding
            return False
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        Raises ValueError when other is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage!
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        Return 1.0 - (decoded length / raw byte length).

        An empty payload yields 0.0 instead of dividing by zero.
        """
        raw_length = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"

    def add_submatch(self, other: CharsetMatch) -> None:
        """
        Register other as a submatch of this match.

        Raises ValueError when other is not a CharsetMatch or equals self.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> list[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: list[str] = []
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        return self._has_sig_or_bom

    @property
    def languages(self) -> list[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """Coherence of the first language result, or 0.0 when there is none."""
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> list[CharsetMatch]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> list[str]:
        """Sorted, de-duplicated Unicode range names of the decoded text (cached)."""
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
        # filter and sort
        self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> list[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters the target encoding cannot represent are substituted by the
        encoder (the "replace" error handler is used), they are not dropped.

        The result is cached until a different target encoding is requested.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            decoded_string = str(self)
            if (
                self._preemptive_declaration is not None
                and self._preemptive_declaration.lower()
                not in ["utf-8", "utf8", "utf_8"]
            ):
                # Rewrite the first encoding declaration found in the first
                # 8192 characters so that it names the target encoding.
                patched_header = sub(
                    RE_POSSIBLE_ENCODING_INDICATION,
                    lambda m: m.string[m.span()[0] : m.span()[1]].replace(
                        m.groups()[0],
                        iana_name(self._output_encoding).replace("_", "-"),  # type: ignore[arg-type]
                    ),
                    decoded_string[:8192],
                    count=1,
                )

                decoded_string = patched_header + decoded_string[8192:]

            self._output_payload = decoded_string.encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
class CharsetMatches:
    """
    Sorted container of CharsetMatch items, best candidate first.
    Behaves like a read-mostly list but only implements a subset of the list API.
    """

    def __init__(self, results: list[CharsetMatch] | None = None):
        self._results: list[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator[CharsetMatch]:
        for match in self._results:
            yield match

    def __getitem__(self, item: int | str) -> CharsetMatch:
        """
        Look up a match by position (int) or by encoding name (str, aliases accepted).

        A position is passed straight to the underlying list, so an invalid one
        raises IndexError. KeyError is raised when no match corresponds to the
        given encoding name, or when item is neither an int nor a str.
        """
        if isinstance(item, int):
            return self._results[item]

        if isinstance(item, str):
            wanted = iana_name(item, False)
            found = next(
                (
                    match
                    for match in self._results
                    if wanted in match.could_be_from_charset
                ),
                None,
            )
            if found is not None:
                return found

        raise KeyError

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) != 0

    def append(self, item: CharsetMatch) -> None:
        """
        Add one match while keeping the container sorted.

        When the payload is smaller than TOO_BIG_SEQUENCE and an existing match
        has the same fingerprint and chaos, the item is attached to that match
        as a submatch instead of being stored on its own.
        Raises ValueError when item is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )

        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        factoring_enabled = len(item.raw) < TOO_BIG_SEQUENCE
        if factoring_enabled:
            for existing in self._results:
                if (
                    existing.fingerprint == item.fingerprint
                    and existing.chaos == item.chaos
                ):
                    existing.add_submatch(item)
                    return

        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> CharsetMatch | None:
        """
        Return the first (most probable) match, or None when the container is empty.
        """
        return self._results[0] if self._results else None

    def first(self) -> CharsetMatch | None:
        """
        Alias of best(), kept for backward compatibility.
        """
        return self.best()
|
|
||||||
|
|
||||||
|
|
||||||
# One coherence result: a (str, float) pair. CharsetMatch reads index 0 as the
# language name and index 1 as its coherence value.
CoherenceMatch = Tuple[str, float]
# A list of coherence results; CharsetMatch treats the first entry as the most
# probable language.
CoherenceMatches = List[CoherenceMatch]
|
|
||||||
|
|
||||||
|
|
||||||
class CliDetectionResult:
    """
    Plain record of one detection result, serializable to JSON via to_json().
    """

    def __init__(
        self,
        path: str,
        encoding: str | None,
        encoding_aliases: list[str],
        alternative_encodings: list[str],
        language: str,
        alphabets: list[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: str | None,
        is_preferred: bool,
    ):
        self.path: str = path
        self.unicode_path: str | None = unicode_path
        self.encoding: str | None = encoding
        self.encoding_aliases: list[str] = encoding_aliases
        self.alternative_encodings: list[str] = alternative_encodings
        self.language: str = language
        self.alphabets: list[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> dict[str, Any]:  # type: ignore
        # The key order below is the order of the keys in the JSON output.
        exported_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {field: getattr(self, field) for field in exported_fields}

    def to_json(self) -> str:
        """Serialize the record as ASCII-only JSON indented by 4 spaces."""
        payload = self.__dict__
        return dumps(payload, ensure_ascii=True, indent=4)
|
|
||||||
@ -1,8 +0,0 @@
|
|||||||
"""
|
|
||||||
Expose version
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
__version__ = "3.4.4"
|
|
||||||
# Dot-separated components of __version__, kept as a list of strings.
VERSION = __version__.split(".")
|
|
||||||
@ -1,24 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
import sys
|
|
||||||
|
|
||||||
try:
|
|
||||||
from ._version import version as __version__
|
|
||||||
except ImportError:
|
|
||||||
__version__ = 'unknown'
|
|
||||||
|
|
||||||
__all__ = ['easter', 'parser', 'relativedelta', 'rrule', 'tz',
|
|
||||||
'utils', 'zoneinfo']
|
|
||||||
|
|
||||||
def __getattr__(name):
    """
    Module-level attribute hook that imports submodules on first access.

    :param name:
        The attribute being looked up on this package.

    :return:
        The imported submodule when ``name`` is listed in ``__all__``.

    :raises AttributeError:
        If ``name`` is not one of the names listed in ``__all__``.
    """
    import importlib

    if name in __all__:
        # Relative import, resolved against this package's own name.
        return importlib.import_module("." + name, __name__)
    raise AttributeError(
        "module {!r} has no attribute {!r}".format(__name__, name)
    )
|
|
||||||
|
|
||||||
|
|
||||||
def __dir__():
    """
    List the package's attributes, including the lazily importable modules.

    :return:
        Every global name that is not itself a key of ``sys.modules``,
        followed by the entries of ``__all__``.
    """
    listing = []
    for candidate in globals():
        if candidate not in sys.modules:
            listing.append(candidate)
    # The lazy-importable modules are always advertised.
    listing.extend(__all__)
    return listing
|
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,43 +0,0 @@
|
|||||||
"""
|
|
||||||
Common code used in multiple modules.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
class weekday(object):
    """
    Day-of-week marker with an optional occurrence index ``n``.

    ``weekday`` is used as an index into ``("MO", ..., "SU")`` when building
    the repr, so 0 corresponds to ``MO`` and 6 to ``SU``.
    """
    __slots__ = ["weekday", "n"]

    def __init__(self, weekday, n=None):
        self.n = n
        self.weekday = weekday

    def __call__(self, n):
        """Return a marker for the same day with index ``n`` (``self`` if unchanged)."""
        return self if n == self.n else self.__class__(self.weekday, n)

    def __eq__(self, other):
        """Equal when both ``weekday`` and ``n`` match; objects lacking them are unequal."""
        try:
            differs = self.weekday != other.weekday or self.n != other.n
        except AttributeError:
            # ``other`` is not weekday-like.
            return False
        return not differs

    def __hash__(self):
        return hash((self.weekday, self.n))

    def __ne__(self, other):
        are_equal = self == other
        return not are_equal

    def __repr__(self):
        """Two-letter day code, followed by the signed index when ``n`` is truthy."""
        label = ("MO", "TU", "WE", "TH", "FR", "SA", "SU")[self.weekday]
        if self.n:
            return "%s(%+d)" % (label, self.n)
        return label
|
|
||||||
|
|
||||||
# vim:ts=4:sw=4:et
|
|
||||||
@ -1,4 +0,0 @@
|
|||||||
# file generated by setuptools_scm
|
|
||||||
# don't change, don't track in version control
|
|
||||||
__version__ = version = '2.9.0.post0'
|
|
||||||
__version_tuple__ = version_tuple = (2, 9, 0)
|
|
||||||
@ -1,89 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
"""
|
|
||||||
This module offers a generic Easter computing method for any given year, using
|
|
||||||
Western, Orthodox or Julian algorithms.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import datetime
|
|
||||||
|
|
||||||
__all__ = ["easter", "EASTER_JULIAN", "EASTER_ORTHODOX", "EASTER_WESTERN"]
|
|
||||||
|
|
||||||
EASTER_JULIAN = 1
EASTER_ORTHODOX = 2
EASTER_WESTERN = 3


def easter(year, method=EASTER_WESTERN):
    """
    Compute the date of Easter for ``year`` using one of three methods.

    Ported from the work done by GM Arts, on top of the algorithm by Claus
    Tondering, which was based in part on the algorithm of Oudin (1940), as
    quoted in "Explanatory Supplement to the Astronomical Almanac",
    P. Kenneth Seidelmann, editor.

    The three calculation methods are:

    1. Original calculation in Julian calendar, valid in
       dates after 326 AD
    2. Original method, with date converted to Gregorian
       calendar, valid in years 1583 to 4099
    3. Revised method, in Gregorian calendar, valid in
       years 1583 to 4099 as well

    They are selected with the constants:

    * ``EASTER_JULIAN   = 1``
    * ``EASTER_ORTHODOX = 2``
    * ``EASTER_WESTERN  = 3``

    The default method is method 3.

    More about the algorithm may be found at:

    `GM Arts: Easter Algorithms <http://www.gmarts.org/index.php?go=415>`_

    and

    `The Calendar FAQ: Easter <https://www.tondering.dk/claus/cal/easter.php>`_

    :param year:
        The year to compute Easter for.

    :param method:
        One of the three constants above.

    :return:
        A :class:`datetime.date`.

    :raises ValueError:
        If ``method`` is outside the range 1..3.
    """

    if not (1 <= method <= 3):
        raise ValueError("invalid method")

    # golden      - Golden year - 1
    # century     - Century
    # epact_term  - (23 - Epact) mod 30
    # pfm_offset  - Number of days from March 21 to Paschal Full Moon
    # pfm_weekday - Weekday for PFM (0=Sunday, etc)
    # shift       - Extra days to add for method 2 (converting Julian
    #               date to Gregorian date)
    # sunday      - Number of days from March 21 to Sunday on or before PFM
    #               (-6 to 28 methods 1 & 3, to 56 for method 2)

    golden = year % 19
    shift = 0
    if method < 3:
        # Old method
        pfm_offset = (19*golden + 15) % 30
        pfm_weekday = (year + year//4 + pfm_offset) % 7
        if method == 2:
            # Extra dates to convert Julian to Gregorian date
            shift = 10
            if year > 1600:
                shift = shift + year//100 - 16 - (year//100 - 16)//4
    else:
        # New method
        century = year//100
        epact_term = (century - century//4 - (8*century + 13)//25
                      + 19*golden + 15) % 30
        pfm_offset = epact_term - (epact_term//28)*(
            1 - (epact_term//28)*(29//(epact_term + 1))*((21 - golden)//11))
        pfm_weekday = (year + year//4 + pfm_offset + 2
                       - century + century//4) % 7

    # sunday can be from -6 to 56 corresponding to dates 22 March to 23 May
    # (later dates apply to method 2, although 23 May never actually occurs)
    sunday = pfm_offset - pfm_weekday + shift
    month = 3 + (sunday + 26)//30
    day = 1 + (sunday + 27 + (sunday + 6)//40) % 31
    return datetime.date(int(year), int(month), int(day))
|
|
||||||
@ -1,61 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
from ._parser import parse, parser, parserinfo, ParserError
|
|
||||||
from ._parser import DEFAULTPARSER, DEFAULTTZPARSER
|
|
||||||
from ._parser import UnknownTimezoneWarning
|
|
||||||
|
|
||||||
from ._parser import __doc__
|
|
||||||
|
|
||||||
from .isoparser import isoparser, isoparse
|
|
||||||
|
|
||||||
__all__ = ['parse', 'parser', 'parserinfo',
|
|
||||||
'isoparse', 'isoparser',
|
|
||||||
'ParserError',
|
|
||||||
'UnknownTimezoneWarning']
|
|
||||||
|
|
||||||
|
|
||||||
###
|
|
||||||
# Deprecate portions of the private interface so that downstream code that
|
|
||||||
# is improperly relying on it is given *some* notice.
|
|
||||||
|
|
||||||
|
|
||||||
def __deprecated_private_func(f):
    """
    Wrap private function ``f`` so that every call emits a DeprecationWarning.

    :param f:
        The function to wrap; its ``__name__`` is embedded in the message.

    :return:
        A wrapper (decorated with :func:`functools.wraps`) that warns and then
        forwards all arguments to ``f``, returning its result.
    """
    from functools import wraps
    import warnings

    msg = ('{name} is a private function and may break without warning, '
           'it will be moved and or renamed in future versions.')
    msg = msg.format(name=f.__name__)

    @wraps(f)
    def deprecated_func(*args, **kwargs):
        # stacklevel=2 attributes the warning to the code calling the private
        # function rather than to this wrapper, so the notice points at the
        # downstream line that needs changing.
        warnings.warn(msg, DeprecationWarning, stacklevel=2)
        return f(*args, **kwargs)

    return deprecated_func
|
|
||||||
|
|
||||||
def __deprecate_private_class(c):
    """
    Build a subclass of private class ``c`` that warns on instantiation.

    :param c:
        The class to wrap; its ``__name__`` is embedded in the message.

    :return:
        A subclass of ``c`` carrying the same ``__name__`` and ``__doc__``
        whose ``__init__`` emits a DeprecationWarning and then delegates to
        the parent initializer.
    """
    import warnings

    msg = ('{name} is a private class and may break without warning, '
           'it will be moved and or renamed in future versions.')
    msg = msg.format(name=c.__name__)

    class private_class(c):
        __doc__ = c.__doc__

        def __init__(self, *args, **kwargs):
            # stacklevel=2 attributes the warning to the code instantiating
            # the class rather than to this __init__.
            warnings.warn(msg, DeprecationWarning, stacklevel=2)
            super(private_class, self).__init__(*args, **kwargs)

    private_class.__name__ = c.__name__

    return private_class
|
|
||||||
|
|
||||||
|
|
||||||
from ._parser import _timelex, _resultbase
|
|
||||||
from ._parser import _tzparser, _parsetz
|
|
||||||
|
|
||||||
_timelex = __deprecate_private_class(_timelex)
|
|
||||||
_tzparser = __deprecate_private_class(_tzparser)
|
|
||||||
_resultbase = __deprecate_private_class(_resultbase)
|
|
||||||
_parsetz = __deprecated_private_func(_parsetz)
|
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@ -1,416 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
"""
|
|
||||||
This module offers a parser for ISO-8601 strings
|
|
||||||
|
|
||||||
It is intended to support all valid date, time and datetime formats per the
|
|
||||||
ISO-8601 specification.
|
|
||||||
|
|
||||||
.. versionadded:: 2.7.0
|
|
||||||
"""
|
|
||||||
from datetime import datetime, timedelta, time, date
|
|
||||||
import calendar
|
|
||||||
from dateutil import tz
|
|
||||||
|
|
||||||
from functools import wraps
|
|
||||||
|
|
||||||
import re
|
|
||||||
import six
|
|
||||||
|
|
||||||
__all__ = ["isoparse", "isoparser"]
|
|
||||||
|
|
||||||
|
|
||||||
def _takes_ascii(f):
    """
    Decorator for methods whose first argument must reach them as ASCII bytes.

    The wrapper accepts a stream (anything with a ``read`` attribute), a text
    string or bytes. A stream is read in full; text is encoded to ASCII. The
    result is passed to ``f`` in place of ``str_in``.

    :raises ValueError:
        If a text input contains non-ASCII characters.
    """
    @wraps(f)
    def ascii_adapter(self, str_in, *args, **kwargs):
        # If it's a stream, read the whole thing; otherwise use it directly.
        try:
            reader = str_in.read
        except AttributeError:
            payload = str_in
        else:
            payload = reader()

        # ISO-8601 only covers ASCII, so text is converted to bytes
        # (ASCII is the same in UTF-8).
        if isinstance(payload, six.text_type):
            try:
                payload = payload.encode('ascii')
            except UnicodeEncodeError as e:
                msg = 'ISO-8601 strings should contain only ASCII characters'
                six.raise_from(ValueError(msg), e)

        return f(self, payload, *args, **kwargs)

    return ascii_adapter
|
|
||||||
|
|
||||||
|
|
||||||
class isoparser(object):
    """
    Parser for ISO-8601 date, time, datetime and time zone offset strings.

    The public ``parse_*`` / ``isoparse`` methods accept text, bytes or a
    stream (via the ``_takes_ascii`` decorator); the private ``_parse_*``
    helpers operate on ASCII bytes only.
    """

    def __init__(self, sep=None):
        """
        :param sep:
            A single character that separates date and time portions. If
            ``None``, the parser will accept any single character.
            For strict ISO-8601 adherence, pass ``'T'``.

        :raises ValueError:
            If ``sep`` is not a single, non-numeric ASCII character.
        """
        if sep is not None:
            if (len(sep) != 1 or ord(sep) >= 128 or sep in '0123456789'):
                raise ValueError('Separator must be a single, non-numeric ' +
                                 'ASCII character')

            # Stored as bytes because the parser compares against byte slices.
            sep = sep.encode('ascii')

        self._sep = sep

    @_takes_ascii
    def isoparse(self, dt_str):
        """
        Parse an ISO-8601 datetime string into a :class:`datetime.datetime`.

        An ISO-8601 datetime string consists of a date portion, followed
        optionally by a time portion - the date and time portions are separated
        by a single character separator, which is ``T`` in the official
        standard. Incomplete date formats (such as ``YYYY-MM``) may *not* be
        combined with a time portion.

        Supported date formats are:

        Common:

        - ``YYYY``
        - ``YYYY-MM``
        - ``YYYY-MM-DD`` or ``YYYYMMDD``

        Uncommon:

        - ``YYYY-Www`` or ``YYYYWww`` - ISO week (day defaults to 1, Monday)
        - ``YYYY-Www-D`` or ``YYYYWwwD`` - ISO week and day

        The ISO week and day numbering follows the same logic as
        :func:`datetime.date.isocalendar`.

        Supported time formats are:

        - ``hh``
        - ``hh:mm`` or ``hhmm``
        - ``hh:mm:ss`` or ``hhmmss``
        - ``hh:mm:ss.ssssss`` (Up to 6 sub-second digits)

        Midnight is a special case for `hh`, as the standard supports both
        00:00 and 24:00 as a representation. The decimal separator can be
        either a dot or a comma.


        .. caution::

            Support for fractional components other than seconds is part of the
            ISO-8601 standard, but is not currently implemented in this parser.

        Supported time zone offset formats are:

        - `Z` (UTC)
        - `±HH:MM`
        - `±HHMM`
        - `±HH`

        Offsets will be represented as :class:`dateutil.tz.tzoffset` objects,
        with the exception of UTC, which will be represented as
        :class:`dateutil.tz.tzutc`. Time zone offsets equivalent to UTC (such
        as `+00:00`) will also be represented as :class:`dateutil.tz.tzutc`.

        :param dt_str:
            A string or stream containing only an ISO-8601 datetime string

        :return:
            Returns a :class:`datetime.datetime` representing the string.
            Unspecified components default to their lowest value.

        :raises ValueError:
            If the string cannot be parsed as an ISO-8601 datetime.

        .. warning::

            As of version 2.7.0, the strictness of the parser should not be
            considered a stable part of the contract. Any valid ISO-8601 string
            that parses correctly with the default settings will continue to
            parse correctly in future versions, but invalid strings that
            currently fail (e.g. ``2017-01-01T00:00+00:00:00``) are not
            guaranteed to continue failing in future versions if they encode
            a valid date.

        .. versionadded:: 2.7.0
        """
        components, pos = self._parse_isodate(dt_str)

        if len(dt_str) > pos:
            # Anything after the date must be <separator><time>.
            if self._sep is None or dt_str[pos:pos + 1] == self._sep:
                components += self._parse_isotime(dt_str[pos + 1:])
            else:
                raise ValueError('String contains unknown ISO components')

        if len(components) > 3 and components[3] == 24:
            # 24:00 means midnight at the end of the day: roll to next day.
            components[3] = 0
            return datetime(*components) + timedelta(days=1)

        return datetime(*components)

    @_takes_ascii
    def parse_isodate(self, datestr):
        """
        Parse the date portion of an ISO string.

        :param datestr:
            The string portion of an ISO string, without a separator

        :return:
            Returns a :class:`datetime.date` object

        :raises ValueError:
            If anything is left over after the date has been parsed.
        """
        components, pos = self._parse_isodate(datestr)
        if pos < len(datestr):
            raise ValueError('String contains unknown ISO ' +
                             'components: {!r}'.format(datestr.decode('ascii')))
        return date(*components)

    @_takes_ascii
    def parse_isotime(self, timestr):
        """
        Parse the time portion of an ISO string.

        :param timestr:
            The time portion of an ISO string, without a separator

        :return:
            Returns a :class:`datetime.time` object
        """
        components = self._parse_isotime(timestr)
        if components[0] == 24:
            # A bare time cannot roll over to the next day; 24:00 becomes 00:00.
            components[0] = 0
        return time(*components)

    @_takes_ascii
    def parse_tzstr(self, tzstr, zero_as_utc=True):
        """
        Parse a valid ISO time zone string.

        See :func:`isoparser.isoparse` for details on supported formats.

        :param tzstr:
            A string representing an ISO time zone offset

        :param zero_as_utc:
            Whether to return :class:`dateutil.tz.tzutc` for zero-offset zones

        :return:
            Returns :class:`dateutil.tz.tzoffset` for offsets and
            :class:`dateutil.tz.tzutc` for ``Z`` and (if ``zero_as_utc`` is
            specified) offsets equivalent to UTC.
        """
        return self._parse_tzstr(tzstr, zero_as_utc=zero_as_utc)

    # Constants
    _DATE_SEP = b'-'
    _TIME_SEP = b':'
    _FRACTION_REGEX = re.compile(b'[\\.,]([0-9]+)')

    def _parse_isodate(self, dt_str):
        """
        Parse the leading date of ``dt_str``.

        Tries the common formats first and falls back to the uncommon
        (week / ordinal) formats when those raise ``ValueError``.

        :return:
            ``(components, pos)``: ``[year, month, day]`` and the index of the
            first byte after the date.
        """
        try:
            return self._parse_isodate_common(dt_str)
        except ValueError:
            return self._parse_isodate_uncommon(dt_str)

    def _parse_isodate_common(self, dt_str):
        """
        Parse ``YYYY``, ``YYYY-MM``, ``YYYY-MM-DD`` or ``YYYYMMDD``.

        :return:
            ``(components, pos)`` as for :meth:`_parse_isodate`; missing month
            and day default to 1.

        :raises ValueError:
            If the string is too short or does not match a common format.
        """
        len_str = len(dt_str)
        components = [1, 1, 1]

        if len_str < 4:
            raise ValueError('ISO string too short')

        # Year
        components[0] = int(dt_str[0:4])
        pos = 4
        if pos >= len_str:
            return components, pos

        has_sep = dt_str[pos:pos + 1] == self._DATE_SEP
        if has_sep:
            pos += 1

        # Month
        if len_str - pos < 2:
            raise ValueError('Invalid common month')

        components[1] = int(dt_str[pos:pos + 2])
        pos += 2

        if pos >= len_str:
            # Year and month only is accepted with a dash (YYYY-MM) but not
            # without one (YYYYMM).
            if has_sep:
                return components, pos
            else:
                raise ValueError('Invalid ISO format')

        if has_sep:
            if dt_str[pos:pos + 1] != self._DATE_SEP:
                raise ValueError('Invalid separator in ISO string')
            pos += 1

        # Day
        if len_str - pos < 2:
            raise ValueError('Invalid common day')
        components[2] = int(dt_str[pos:pos + 2])
        return components, pos + 2

    def _parse_isodate_uncommon(self, dt_str):
        """
        Parse ISO week dates (``YYYY-Www[-D]`` / ``YYYYWww[D]``) and ordinal
        dates (``YYYY-DDD`` / ``YYYYDDD``).

        :return:
            ``(components, pos)`` as for :meth:`_parse_isodate`.

        :raises ValueError:
            If the string is too short, mixes dashed and undashed forms, or
            holds an out-of-range week, weekday or ordinal day.
        """
        if len(dt_str) < 4:
            raise ValueError('ISO string too short')

        # All ISO formats start with the year
        year = int(dt_str[0:4])

        has_sep = dt_str[4:5] == self._DATE_SEP

        pos = 4 + has_sep       # Skip '-' if it's there
        if dt_str[pos:pos + 1] == b'W':
            # YYYY-?Www-?D?
            pos += 1
            weekno = int(dt_str[pos:pos + 2])
            pos += 2

            # Without an explicit day the first day of the week is used.
            dayno = 1
            if len(dt_str) > pos:
                if (dt_str[pos:pos + 1] == self._DATE_SEP) != has_sep:
                    raise ValueError('Inconsistent use of dash separator')

                pos += has_sep

                dayno = int(dt_str[pos:pos + 1])
                pos += 1

            base_date = self._calculate_weekdate(year, weekno, dayno)
        else:
            # YYYYDDD or YYYY-DDD
            if len(dt_str) - pos < 3:
                raise ValueError('Invalid ordinal day')

            ordinal_day = int(dt_str[pos:pos + 3])
            pos += 3

            if ordinal_day < 1 or ordinal_day > (365 + calendar.isleap(year)):
                raise ValueError('Invalid ordinal day' +
                                 ' {} for year {}'.format(ordinal_day, year))

            base_date = date(year, 1, 1) + timedelta(days=ordinal_day - 1)

        components = [base_date.year, base_date.month, base_date.day]
        return components, pos

    def _calculate_weekdate(self, year, week, day):
        """
        Calculate the day of corresponding to the ISO year-week-day calendar.

        This function is effectively the inverse of
        :func:`datetime.date.isocalendar`.

        :param year:
            The year in the ISO calendar

        :param week:
            The week in the ISO calendar - range is [1, 53]

        :param day:
            The day in the ISO calendar - range is [1 (MON), 7 (SUN)]

        :return:
            Returns a :class:`datetime.date`

        :raises ValueError:
            If ``week`` or ``day`` is outside its range.
        """
        if not 0 < week < 54:
            raise ValueError('Invalid week: {}'.format(week))

        if not 0 < day < 8:     # Range is 1-7
            raise ValueError('Invalid weekday: {}'.format(day))

        # Get week 1 for the specific year:
        jan_4 = date(year, 1, 4)   # Week 1 always has January 4th in it
        week_1 = jan_4 - timedelta(days=jan_4.isocalendar()[2] - 1)

        # Now add the specific number of weeks and days to get what we want
        week_offset = (week - 1) * 7 + (day - 1)
        return week_1 + timedelta(days=week_offset)

    def _parse_isotime(self, timestr):
        """
        Parse an ISO time, with optional fraction and time zone offset.

        :return:
            ``[hour, minute, second, microsecond, tzinfo]``; ``tzinfo`` is
            ``None`` when no offset is present. An hour of 24 is returned
            as-is for the caller to normalise.

        :raises ValueError:
            If the string is too short, mixes colon and non-colon forms, has
            trailing unparsed content, or uses hour 24 with any non-zero
            minute, second or microsecond.
        """
        len_str = len(timestr)
        components = [0, 0, 0, 0, None]
        pos = 0
        comp = -1

        if len_str < 2:
            raise ValueError('ISO time too short')

        has_sep = False

        while pos < len_str and comp < 5:
            comp += 1

            if timestr[pos:pos + 1] in b'-+Zz':
                # Detect time zone boundary
                components[-1] = self._parse_tzstr(timestr[pos:])
                pos = len_str
                break

            if comp == 1 and timestr[pos:pos+1] == self._TIME_SEP:
                has_sep = True
                pos += 1
            elif comp == 2 and has_sep:
                if timestr[pos:pos+1] != self._TIME_SEP:
                    raise ValueError('Inconsistent use of colon separator')
                pos += 1

            if comp < 3:
                # Hour, minute, second
                components[comp] = int(timestr[pos:pos + 2])
                pos += 2

            if comp == 3:
                # Fraction of a second
                frac = self._FRACTION_REGEX.match(timestr[pos:])
                if not frac:
                    continue

                us_str = frac.group(1)[:6]  # Truncate to microseconds
                components[comp] = int(us_str) * 10**(6 - len(us_str))
                pos += len(frac.group())

        if pos < len_str:
            raise ValueError('Unused components in ISO string')

        if components[0] == 24:
            # Standard supports 00:00 and 24:00 as representations of midnight
            if any(component != 0 for component in components[1:4]):
                raise ValueError('Hour may only be 24 at 24:00:00.000')

        return components

    def _parse_tzstr(self, tzstr, zero_as_utc=True):
        """
        Parse ``Z``, ``±HH``, ``±HHMM`` or ``±HH:MM`` (as bytes).

        :param zero_as_utc:
            Whether a zero offset is returned as ``tz.UTC`` rather than a
            ``tz.tzoffset``.

        :raises ValueError:
            If the length, sign, colon placement, hours or minutes are
            invalid.
        """
        if tzstr == b'Z' or tzstr == b'z':
            return tz.UTC

        if len(tzstr) not in {3, 5, 6}:
            raise ValueError('Time zone offset must be 1, 3, 5 or 6 characters')

        if tzstr[0:1] == b'-':
            mult = -1
        elif tzstr[0:1] == b'+':
            mult = 1
        else:
            raise ValueError('Time zone offset requires sign')

        hours = int(tzstr[1:3])
        if len(tzstr) == 3:
            minutes = 0
        else:
            has_colon = tzstr[3:4] == self._TIME_SEP
            # The colon form must be exactly 6 bytes and the compact form
            # exactly 5. Without this check '+05:3' would be read as 5h 3m
            # and '+00000' as UTC.
            if len(tzstr) != (6 if has_colon else 5):
                raise ValueError('Invalid time zone offset format')
            minutes = int(tzstr[(4 if has_colon else 3):])

        if zero_as_utc and hours == 0 and minutes == 0:
            return tz.UTC
        else:
            if minutes > 59:
                raise ValueError('Invalid minutes in time zone offset')

            if hours > 23:
                raise ValueError('Invalid hours in time zone offset')

            return tz.tzoffset(None, mult * (hours * 60 + minutes) * 60)
|
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_ISOPARSER = isoparser()
|
|
||||||
isoparse = DEFAULT_ISOPARSER.isoparse
|
|
||||||
@ -1,599 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
import datetime
|
|
||||||
import calendar
|
|
||||||
|
|
||||||
import operator
|
|
||||||
from math import copysign
|
|
||||||
|
|
||||||
from six import integer_types
|
|
||||||
from warnings import warn
|
|
||||||
|
|
||||||
from ._common import weekday
|
|
||||||
|
|
||||||
MO, TU, WE, TH, FR, SA, SU = weekdays = tuple(weekday(x) for x in range(7))
|
|
||||||
|
|
||||||
__all__ = ["relativedelta", "MO", "TU", "WE", "TH", "FR", "SA", "SU"]
|
|
||||||
|
|
||||||
|
|
||||||
class relativedelta(object):
|
|
||||||
"""
|
|
||||||
The relativedelta type is designed to be applied to an existing datetime and
|
|
||||||
can replace specific components of that datetime, or represents an interval
|
|
||||||
of time.
|
|
||||||
|
|
||||||
It is based on the specification of the excellent work done by M.-A. Lemburg
|
|
||||||
in his
|
|
||||||
`mx.DateTime <https://www.egenix.com/products/python/mxBase/mxDateTime/>`_ extension.
|
|
||||||
However, notice that this type does *NOT* implement the same algorithm as
|
|
||||||
his work. Do *NOT* expect it to behave like mx.DateTime's counterpart.
|
|
||||||
|
|
||||||
There are two different ways to build a relativedelta instance. The
|
|
||||||
first one is passing it two date/datetime classes::
|
|
||||||
|
|
||||||
relativedelta(datetime1, datetime2)
|
|
||||||
|
|
||||||
The second one is passing it any number of the following keyword arguments::
|
|
||||||
|
|
||||||
relativedelta(arg1=x,arg2=y,arg3=z...)
|
|
||||||
|
|
||||||
year, month, day, hour, minute, second, microsecond:
|
|
||||||
Absolute information (argument is singular); adding or subtracting a
|
|
||||||
relativedelta with absolute information does not perform an arithmetic
|
|
||||||
operation, but rather REPLACES the corresponding value in the
|
|
||||||
original datetime with the value(s) in relativedelta.
|
|
||||||
|
|
||||||
years, months, weeks, days, hours, minutes, seconds, microseconds:
|
|
||||||
Relative information, may be negative (argument is plural); adding
|
|
||||||
or subtracting a relativedelta with relative information performs
|
|
||||||
the corresponding arithmetic operation on the original datetime value
|
|
||||||
with the information in the relativedelta.
|
|
||||||
|
|
||||||
weekday:
|
|
||||||
One of the weekday instances (MO, TU, etc) available in the
|
|
||||||
relativedelta module. These instances may receive a parameter N,
|
|
||||||
specifying the Nth weekday, which could be positive or negative
|
|
||||||
(like MO(+1) or MO(-2)). Not specifying it is the same as specifying
|
|
||||||
+1. You can also use an integer, where 0=MO. This argument is always
|
|
||||||
relative e.g. if the calculated date is already Monday, using MO(1)
|
|
||||||
or MO(-1) won't change the day. To effectively make it absolute, use
|
|
||||||
it in combination with the day argument (e.g. day=1, MO(1) for first
|
|
||||||
Monday of the month).
|
|
||||||
|
|
||||||
leapdays:
|
|
||||||
Will add given days to the date found, if year is a leap
|
|
||||||
year, and the date found is post 28 of february.
|
|
||||||
|
|
||||||
yearday, nlyearday:
|
|
||||||
Set the yearday or the non-leap year day (jump leap days).
|
|
||||||
These are converted to day/month/leapdays information.
|
|
||||||
|
|
||||||
There are relative and absolute forms of the keyword
|
|
||||||
arguments. The plural is relative, and the singular is
|
|
||||||
absolute. For each argument in the order below, the absolute form
|
|
||||||
is applied first (by setting each attribute to that value) and
|
|
||||||
then the relative form (by adding the value to the attribute).
|
|
||||||
|
|
||||||
The order of attributes considered when this relativedelta is
|
|
||||||
added to a datetime is:
|
|
||||||
|
|
||||||
1. Year
|
|
||||||
2. Month
|
|
||||||
3. Day
|
|
||||||
4. Hours
|
|
||||||
5. Minutes
|
|
||||||
6. Seconds
|
|
||||||
7. Microseconds
|
|
||||||
|
|
||||||
Finally, weekday is applied, using the rule described above.
|
|
||||||
|
|
||||||
For example
|
|
||||||
|
|
||||||
>>> from datetime import datetime
|
|
||||||
>>> from dateutil.relativedelta import relativedelta, MO
|
|
||||||
>>> dt = datetime(2018, 4, 9, 13, 37, 0)
|
|
||||||
>>> delta = relativedelta(hours=25, day=1, weekday=MO(1))
|
|
||||||
>>> dt + delta
|
|
||||||
datetime.datetime(2018, 4, 2, 14, 37)
|
|
||||||
|
|
||||||
First, the day is set to 1 (the first of the month), then 25 hours
|
|
||||||
are added, to get to the 2nd day and 14th hour, finally the
|
|
||||||
weekday is applied, but since the 2nd is already a Monday there is
|
|
||||||
no effect.
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, dt1=None, dt2=None,
|
|
||||||
years=0, months=0, days=0, leapdays=0, weeks=0,
|
|
||||||
hours=0, minutes=0, seconds=0, microseconds=0,
|
|
||||||
year=None, month=None, day=None, weekday=None,
|
|
||||||
yearday=None, nlyearday=None,
|
|
||||||
hour=None, minute=None, second=None, microsecond=None):
|
|
||||||
|
|
||||||
if dt1 and dt2:
|
|
||||||
# datetime is a subclass of date. So both must be date
|
|
||||||
if not (isinstance(dt1, datetime.date) and
|
|
||||||
isinstance(dt2, datetime.date)):
|
|
||||||
raise TypeError("relativedelta only diffs datetime/date")
|
|
||||||
|
|
||||||
# We allow two dates, or two datetimes, so we coerce them to be
|
|
||||||
# of the same type
|
|
||||||
if (isinstance(dt1, datetime.datetime) !=
|
|
||||||
isinstance(dt2, datetime.datetime)):
|
|
||||||
if not isinstance(dt1, datetime.datetime):
|
|
||||||
dt1 = datetime.datetime.fromordinal(dt1.toordinal())
|
|
||||||
elif not isinstance(dt2, datetime.datetime):
|
|
||||||
dt2 = datetime.datetime.fromordinal(dt2.toordinal())
|
|
||||||
|
|
||||||
self.years = 0
|
|
||||||
self.months = 0
|
|
||||||
self.days = 0
|
|
||||||
self.leapdays = 0
|
|
||||||
self.hours = 0
|
|
||||||
self.minutes = 0
|
|
||||||
self.seconds = 0
|
|
||||||
self.microseconds = 0
|
|
||||||
self.year = None
|
|
||||||
self.month = None
|
|
||||||
self.day = None
|
|
||||||
self.weekday = None
|
|
||||||
self.hour = None
|
|
||||||
self.minute = None
|
|
||||||
self.second = None
|
|
||||||
self.microsecond = None
|
|
||||||
self._has_time = 0
|
|
||||||
|
|
||||||
# Get year / month delta between the two
|
|
||||||
months = (dt1.year - dt2.year) * 12 + (dt1.month - dt2.month)
|
|
||||||
self._set_months(months)
|
|
||||||
|
|
||||||
# Remove the year/month delta so the timedelta is just well-defined
|
|
||||||
# time units (seconds, days and microseconds)
|
|
||||||
dtm = self.__radd__(dt2)
|
|
||||||
|
|
||||||
# If we've overshot our target, make an adjustment
|
|
||||||
if dt1 < dt2:
|
|
||||||
compare = operator.gt
|
|
||||||
increment = 1
|
|
||||||
else:
|
|
||||||
compare = operator.lt
|
|
||||||
increment = -1
|
|
||||||
|
|
||||||
while compare(dt1, dtm):
|
|
||||||
months += increment
|
|
||||||
self._set_months(months)
|
|
||||||
dtm = self.__radd__(dt2)
|
|
||||||
|
|
||||||
# Get the timedelta between the "months-adjusted" date and dt1
|
|
||||||
delta = dt1 - dtm
|
|
||||||
self.seconds = delta.seconds + delta.days * 86400
|
|
||||||
self.microseconds = delta.microseconds
|
|
||||||
else:
|
|
||||||
# Check for non-integer values in integer-only quantities
|
|
||||||
if any(x is not None and x != int(x) for x in (years, months)):
|
|
||||||
raise ValueError("Non-integer years and months are "
|
|
||||||
"ambiguous and not currently supported.")
|
|
||||||
|
|
||||||
# Relative information
|
|
||||||
self.years = int(years)
|
|
||||||
self.months = int(months)
|
|
||||||
self.days = days + weeks * 7
|
|
||||||
self.leapdays = leapdays
|
|
||||||
self.hours = hours
|
|
||||||
self.minutes = minutes
|
|
||||||
self.seconds = seconds
|
|
||||||
self.microseconds = microseconds
|
|
||||||
|
|
||||||
# Absolute information
|
|
||||||
self.year = year
|
|
||||||
self.month = month
|
|
||||||
self.day = day
|
|
||||||
self.hour = hour
|
|
||||||
self.minute = minute
|
|
||||||
self.second = second
|
|
||||||
self.microsecond = microsecond
|
|
||||||
|
|
||||||
if any(x is not None and int(x) != x
|
|
||||||
for x in (year, month, day, hour,
|
|
||||||
minute, second, microsecond)):
|
|
||||||
# For now we'll deprecate floats - later it'll be an error.
|
|
||||||
warn("Non-integer value passed as absolute information. " +
|
|
||||||
"This is not a well-defined condition and will raise " +
|
|
||||||
"errors in future versions.", DeprecationWarning)
|
|
||||||
|
|
||||||
if isinstance(weekday, integer_types):
|
|
||||||
self.weekday = weekdays[weekday]
|
|
||||||
else:
|
|
||||||
self.weekday = weekday
|
|
||||||
|
|
||||||
yday = 0
|
|
||||||
if nlyearday:
|
|
||||||
yday = nlyearday
|
|
||||||
elif yearday:
|
|
||||||
yday = yearday
|
|
||||||
if yearday > 59:
|
|
||||||
self.leapdays = -1
|
|
||||||
if yday:
|
|
||||||
ydayidx = [31, 59, 90, 120, 151, 181, 212,
|
|
||||||
243, 273, 304, 334, 366]
|
|
||||||
for idx, ydays in enumerate(ydayidx):
|
|
||||||
if yday <= ydays:
|
|
||||||
self.month = idx+1
|
|
||||||
if idx == 0:
|
|
||||||
self.day = yday
|
|
||||||
else:
|
|
||||||
self.day = yday-ydayidx[idx-1]
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
raise ValueError("invalid year day (%d)" % yday)
|
|
||||||
|
|
||||||
self._fix()
|
|
||||||
|
|
||||||
def _fix(self):
|
|
||||||
if abs(self.microseconds) > 999999:
|
|
||||||
s = _sign(self.microseconds)
|
|
||||||
div, mod = divmod(self.microseconds * s, 1000000)
|
|
||||||
self.microseconds = mod * s
|
|
||||||
self.seconds += div * s
|
|
||||||
if abs(self.seconds) > 59:
|
|
||||||
s = _sign(self.seconds)
|
|
||||||
div, mod = divmod(self.seconds * s, 60)
|
|
||||||
self.seconds = mod * s
|
|
||||||
self.minutes += div * s
|
|
||||||
if abs(self.minutes) > 59:
|
|
||||||
s = _sign(self.minutes)
|
|
||||||
div, mod = divmod(self.minutes * s, 60)
|
|
||||||
self.minutes = mod * s
|
|
||||||
self.hours += div * s
|
|
||||||
if abs(self.hours) > 23:
|
|
||||||
s = _sign(self.hours)
|
|
||||||
div, mod = divmod(self.hours * s, 24)
|
|
||||||
self.hours = mod * s
|
|
||||||
self.days += div * s
|
|
||||||
if abs(self.months) > 11:
|
|
||||||
s = _sign(self.months)
|
|
||||||
div, mod = divmod(self.months * s, 12)
|
|
||||||
self.months = mod * s
|
|
||||||
self.years += div * s
|
|
||||||
if (self.hours or self.minutes or self.seconds or self.microseconds
|
|
||||||
or self.hour is not None or self.minute is not None or
|
|
||||||
self.second is not None or self.microsecond is not None):
|
|
||||||
self._has_time = 1
|
|
||||||
else:
|
|
||||||
self._has_time = 0
|
|
||||||
|
|
||||||
@property
def weeks(self):
    """Whole weeks contained in ``days``, truncated toward zero."""
    as_weeks = self.days / 7.0
    return int(as_weeks)

@weeks.setter
def weeks(self, value):
    # Keep the days left over after removing the whole weeks, then add
    # the requested number of weeks back in.
    leftover_days = self.days - (self.weeks * 7)
    self.days = leftover_days + value * 7
|
|
||||||
|
|
||||||
def _set_months(self, months):
|
|
||||||
self.months = months
|
|
||||||
if abs(self.months) > 11:
|
|
||||||
s = _sign(self.months)
|
|
||||||
div, mod = divmod(self.months * s, 12)
|
|
||||||
self.months = mod * s
|
|
||||||
self.years = div * s
|
|
||||||
else:
|
|
||||||
self.years = 0
|
|
||||||
|
|
||||||
def normalized(self):
    """
    Return a version of this object represented entirely using integer
    values for the relative attributes.

    >>> relativedelta(days=1.5, hours=2).normalized()
    relativedelta(days=+1, hours=+14)

    :return:
        Returns a :class:`dateutil.relativedelta.relativedelta` object.
    """
    # Push each fractional part down into the next smaller unit, rounding
    # along the way to roughly the nearest microsecond.
    whole_days = int(self.days)
    day_fraction = self.days - whole_days

    exact_hours = round(self.hours + 24 * day_fraction, 11)
    whole_hours = int(exact_hours)

    exact_minutes = round(self.minutes + 60 * (exact_hours - whole_hours), 10)
    whole_minutes = int(exact_minutes)

    exact_seconds = round(self.seconds + 60 * (exact_minutes - whole_minutes),
                          8)
    whole_seconds = int(exact_seconds)

    micro = round(self.microseconds + 1e6 * (exact_seconds - whole_seconds))

    fields = {
        "years": self.years,
        "months": self.months,
        "days": whole_days,
        "hours": whole_hours,
        "minutes": whole_minutes,
        "seconds": whole_seconds,
        "microseconds": micro,
        "leapdays": self.leapdays,
        "year": self.year,
        "month": self.month,
        "day": self.day,
        "weekday": self.weekday,
        "hour": self.hour,
        "minute": self.minute,
        "second": self.second,
        "microsecond": self.microsecond,
    }
    # The constructor is handed the integer values and every other field
    # unchanged.
    return self.__class__(**fields)
|
|
||||||
|
|
||||||
def __add__(self, other):
    """
    Add ``other`` to this object.

    * Another ``relativedelta``: relative fields are summed; for
      ``leapdays`` and every absolute field the value from ``other`` wins
      when it is set.
    * A :class:`datetime.timedelta`: its days, seconds and microseconds
      are added to the matching relative fields.
    * A :class:`datetime.date` / :class:`datetime.datetime`: the delta is
      applied and the resulting date or datetime is returned.

    Any other operand yields ``NotImplemented``.
    """
    relative_names = ("years", "months", "days", "hours",
                      "minutes", "seconds", "microseconds")
    absolute_names = ("year", "month", "day", "weekday",
                      "hour", "minute", "second", "microsecond")

    if isinstance(other, relativedelta):
        combined = dict((name, getattr(other, name) + getattr(self, name))
                        for name in relative_names)
        combined["leapdays"] = other.leapdays or self.leapdays
        for name in absolute_names:
            theirs = getattr(other, name)
            combined[name] = (theirs if theirs is not None
                              else getattr(self, name))
        return self.__class__(**combined)

    if isinstance(other, datetime.timedelta):
        merged = dict((name, getattr(self, name))
                      for name in relative_names + ("leapdays",)
                      + absolute_names)
        merged["days"] = self.days + other.days
        merged["seconds"] = self.seconds + other.seconds
        merged["microseconds"] = self.microseconds + other.microseconds
        return self.__class__(**merged)

    if not isinstance(other, datetime.date):
        return NotImplemented
    elif self._has_time and not isinstance(other, datetime.datetime):
        # A plain date cannot carry time fields; promote it to a
        # datetime on the same day first.
        other = datetime.datetime.fromordinal(other.toordinal())

    target_year = (self.year or other.year) + self.years
    target_month = self.month or other.month
    if self.months:
        assert 1 <= abs(self.months) <= 12
        target_month += self.months
        if target_month > 12:
            target_year += 1
            target_month -= 12
        elif target_month < 1:
            target_year -= 1
            target_month += 12

    # Clamp the day to the length of the target month.
    last_day = calendar.monthrange(target_year, target_month)[1]
    target_day = min(last_day, self.day or other.day)

    replacements = {"year": target_year, "month": target_month,
                    "day": target_day}
    for name in ("hour", "minute", "second", "microsecond"):
        absolute = getattr(self, name)
        if absolute is not None:
            replacements[name] = absolute

    day_offset = self.days
    if self.leapdays and target_month > 2 and calendar.isleap(target_year):
        day_offset += self.leapdays

    offset = datetime.timedelta(days=day_offset,
                                hours=self.hours,
                                minutes=self.minutes,
                                seconds=self.seconds,
                                microseconds=self.microseconds)
    result = other.replace(**replacements) + offset

    if self.weekday:
        wanted, nth = self.weekday.weekday, self.weekday.n or 1
        week_jump = (abs(nth) - 1) * 7
        if nth > 0:
            # Move forward to the nth matching weekday.
            shift = week_jump + (7 - result.weekday() + wanted) % 7
        else:
            # Move backward to the nth matching weekday.
            shift = -(week_jump + (result.weekday() - wanted) % 7)
        result += datetime.timedelta(days=shift)
    return result
|
|
||||||
|
|
||||||
def __radd__(self, other):
|
|
||||||
return self.__add__(other)
|
|
||||||
|
|
||||||
def __rsub__(self, other):
|
|
||||||
return self.__neg__().__radd__(other)
|
|
||||||
|
|
||||||
def __sub__(self, other):
    """
    Subtract another ``relativedelta`` from this one.

    Relative fields are subtracted field by field. For ``leapdays`` and
    every absolute field, the value on ``self`` wins when it is set and
    the value from ``other`` is used otherwise. Any other operand type
    yields ``NotImplemented``.
    """
    if not isinstance(other, relativedelta):
        return NotImplemented   # In case the other object defines __rsub__

    difference = dict(
        (name, getattr(self, name) - getattr(other, name))
        for name in ("years", "months", "days", "hours",
                     "minutes", "seconds", "microseconds"))
    difference["leapdays"] = self.leapdays or other.leapdays
    for name in ("year", "month", "day", "weekday",
                 "hour", "minute", "second", "microsecond"):
        mine = getattr(self, name)
        difference[name] = mine if mine is not None else getattr(other, name)
    return self.__class__(**difference)
|
|
||||||
|
|
||||||
def __abs__(self):
|
|
||||||
return self.__class__(years=abs(self.years),
|
|
||||||
months=abs(self.months),
|
|
||||||
days=abs(self.days),
|
|
||||||
hours=abs(self.hours),
|
|
||||||
minutes=abs(self.minutes),
|
|
||||||
seconds=abs(self.seconds),
|
|
||||||
microseconds=abs(self.microseconds),
|
|
||||||
leapdays=self.leapdays,
|
|
||||||
year=self.year,
|
|
||||||
month=self.month,
|
|
||||||
day=self.day,
|
|
||||||
weekday=self.weekday,
|
|
||||||
hour=self.hour,
|
|
||||||
minute=self.minute,
|
|
||||||
second=self.second,
|
|
||||||
microsecond=self.microsecond)
|
|
||||||
|
|
||||||
def __neg__(self):
|
|
||||||
return self.__class__(years=-self.years,
|
|
||||||
months=-self.months,
|
|
||||||
days=-self.days,
|
|
||||||
hours=-self.hours,
|
|
||||||
minutes=-self.minutes,
|
|
||||||
seconds=-self.seconds,
|
|
||||||
microseconds=-self.microseconds,
|
|
||||||
leapdays=self.leapdays,
|
|
||||||
year=self.year,
|
|
||||||
month=self.month,
|
|
||||||
day=self.day,
|
|
||||||
weekday=self.weekday,
|
|
||||||
hour=self.hour,
|
|
||||||
minute=self.minute,
|
|
||||||
second=self.second,
|
|
||||||
microsecond=self.microsecond)
|
|
||||||
|
|
||||||
def __bool__(self):
|
|
||||||
return not (not self.years and
|
|
||||||
not self.months and
|
|
||||||
not self.days and
|
|
||||||
not self.hours and
|
|
||||||
not self.minutes and
|
|
||||||
not self.seconds and
|
|
||||||
not self.microseconds and
|
|
||||||
not self.leapdays and
|
|
||||||
self.year is None and
|
|
||||||
self.month is None and
|
|
||||||
self.day is None and
|
|
||||||
self.weekday is None and
|
|
||||||
self.hour is None and
|
|
||||||
self.minute is None and
|
|
||||||
self.second is None and
|
|
||||||
self.microsecond is None)
|
|
||||||
# Compatibility with Python 2.x
|
|
||||||
__nonzero__ = __bool__
|
|
||||||
|
|
||||||
def __mul__(self, other):
|
|
||||||
try:
|
|
||||||
f = float(other)
|
|
||||||
except TypeError:
|
|
||||||
return NotImplemented
|
|
||||||
|
|
||||||
return self.__class__(years=int(self.years * f),
|
|
||||||
months=int(self.months * f),
|
|
||||||
days=int(self.days * f),
|
|
||||||
hours=int(self.hours * f),
|
|
||||||
minutes=int(self.minutes * f),
|
|
||||||
seconds=int(self.seconds * f),
|
|
||||||
microseconds=int(self.microseconds * f),
|
|
||||||
leapdays=self.leapdays,
|
|
||||||
year=self.year,
|
|
||||||
month=self.month,
|
|
||||||
day=self.day,
|
|
||||||
weekday=self.weekday,
|
|
||||||
hour=self.hour,
|
|
||||||
minute=self.minute,
|
|
||||||
second=self.second,
|
|
||||||
microsecond=self.microsecond)
|
|
||||||
|
|
||||||
__rmul__ = __mul__
|
|
||||||
|
|
||||||
def __eq__(self, other):
    """
    Compare two ``relativedelta`` objects field by field.

    Weekdays match when they name the same day and have the same
    ordinal, where an unset ordinal and an ordinal of 1 are treated as
    equal. Any other operand type yields ``NotImplemented``.
    """
    if not isinstance(other, relativedelta):
        return NotImplemented

    mine, theirs = self.weekday, other.weekday
    if mine or theirs:
        if not (mine and theirs):
            return False
        if mine.weekday != theirs.weekday:
            return False
        n1, n2 = mine.n, theirs.n
        # "No ordinal" and "first occurrence" mean the same thing.
        both_first = (not n1 or n1 == 1) and (not n2 or n2 == 1)
        if n1 != n2 and not both_first:
            return False

    for name in ("years", "months", "days", "hours", "minutes", "seconds",
                 "microseconds", "leapdays", "year", "month", "day",
                 "hour", "minute", "second", "microsecond"):
        if not getattr(self, name) == getattr(other, name):
            return False
    return True
|
|
||||||
|
|
||||||
def __hash__(self):
|
|
||||||
return hash((
|
|
||||||
self.weekday,
|
|
||||||
self.years,
|
|
||||||
self.months,
|
|
||||||
self.days,
|
|
||||||
self.hours,
|
|
||||||
self.minutes,
|
|
||||||
self.seconds,
|
|
||||||
self.microseconds,
|
|
||||||
self.leapdays,
|
|
||||||
self.year,
|
|
||||||
self.month,
|
|
||||||
self.day,
|
|
||||||
self.hour,
|
|
||||||
self.minute,
|
|
||||||
self.second,
|
|
||||||
self.microsecond,
|
|
||||||
))
|
|
||||||
|
|
||||||
def __ne__(self, other):
|
|
||||||
return not self.__eq__(other)
|
|
||||||
|
|
||||||
def __div__(self, other):
|
|
||||||
try:
|
|
||||||
reciprocal = 1 / float(other)
|
|
||||||
except TypeError:
|
|
||||||
return NotImplemented
|
|
||||||
|
|
||||||
return self.__mul__(reciprocal)
|
|
||||||
|
|
||||||
__truediv__ = __div__
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
l = []
|
|
||||||
for attr in ["years", "months", "days", "leapdays",
|
|
||||||
"hours", "minutes", "seconds", "microseconds"]:
|
|
||||||
value = getattr(self, attr)
|
|
||||||
if value:
|
|
||||||
l.append("{attr}={value:+g}".format(attr=attr, value=value))
|
|
||||||
for attr in ["year", "month", "day", "weekday",
|
|
||||||
"hour", "minute", "second", "microsecond"]:
|
|
||||||
value = getattr(self, attr)
|
|
||||||
if value is not None:
|
|
||||||
l.append("{attr}={value}".format(attr=attr, value=repr(value)))
|
|
||||||
return "{classname}({attrs})".format(classname=self.__class__.__name__,
|
|
||||||
attrs=", ".join(l))
|
|
||||||
|
|
||||||
|
|
||||||
def _sign(x):
|
|
||||||
return int(copysign(1, x))
|
|
||||||
|
|
||||||
# vim:ts=4:sw=4:et
|
|
||||||
File diff suppressed because it is too large
Load Diff
@ -1,12 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
from .tz import *
|
|
||||||
from .tz import __doc__
|
|
||||||
|
|
||||||
__all__ = ["tzutc", "tzoffset", "tzlocal", "tzfile", "tzrange",
|
|
||||||
"tzstr", "tzical", "tzwin", "tzwinlocal", "gettz",
|
|
||||||
"enfold", "datetime_ambiguous", "datetime_exists",
|
|
||||||
"resolve_imaginary", "UTC", "DeprecatedTzFormatWarning"]
|
|
||||||
|
|
||||||
|
|
||||||
# Derives directly from the built-in ``Warning``; it is exported through
# the ``__all__`` list above.
class DeprecatedTzFormatWarning(Warning):
    """Warning raised when time zones are parsed from deprecated formats."""
|
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,419 +0,0 @@
|
|||||||
from six import PY2
|
|
||||||
|
|
||||||
from functools import wraps
|
|
||||||
|
|
||||||
from datetime import datetime, timedelta, tzinfo
|
|
||||||
|
|
||||||
|
|
||||||
ZERO = timedelta(0)
|
|
||||||
|
|
||||||
__all__ = ['tzname_in_python2', 'enfold']
|
|
||||||
|
|
||||||
|
|
||||||
def tzname_in_python2(namefunc):
    """Change unicode output into bytestrings in Python 2

    tzname() API changed in Python 3. It used to return bytes, but was changed
    to unicode strings

    When ``PY2`` is false, ``namefunc`` is returned untouched.
    """
    if not PY2:
        return namefunc

    @wraps(namefunc)
    def adjust_encoding(*args, **kwargs):
        result = namefunc(*args, **kwargs)
        # ``None`` passes through unchanged; anything else is encoded.
        return result.encode() if result is not None else result

    return adjust_encoding
|
|
||||||
|
|
||||||
|
|
||||||
# The following is adapted from Alexander Belopolsky's tz library
|
|
||||||
# https://github.com/abalkin/tz
|
|
||||||
if hasattr(datetime, 'fold'):
    # ``datetime`` already has a ``fold`` attribute (PEP 495), so
    # ``replace`` can set it directly.
    def enfold(dt, fold=1):
        """
        Provides a unified interface for assigning the ``fold`` attribute to
        datetimes both before and after the implementation of PEP-495.

        :param fold:
            The value for the ``fold`` attribute in the returned datetime. This
            should be either 0 or 1.

        :return:
            Returns an object for which ``getattr(dt, 'fold', 0)`` returns
            ``fold`` for all versions of Python. In versions prior to
            Python 3.6, this is a ``_DatetimeWithFold`` object, which is a
            subclass of :py:class:`datetime.datetime` with the ``fold``
            attribute added, if ``fold`` is 1.

        .. versionadded:: 2.6.0
        """
        return dt.replace(fold=fold)

else:
    class _DatetimeWithFold(datetime):
        """
        This is a class designed to provide a PEP 495-compliant interface for
        Python versions before 3.6. It is used only for dates in a fold, so
        the ``fold`` attribute is fixed at ``1``.

        .. versionadded:: 2.6.0
        """
        __slots__ = ()

        def replace(self, *args, **kwargs):
            """
            Return a datetime with the same attributes, except for those
            attributes given new values by whichever keyword arguments are
            specified. Note that tzinfo=None can be specified to create a naive
            datetime from an aware datetime with no conversion of date and time
            data.

            This is reimplemented in ``_DatetimeWithFold`` because pypy3 will
            return a ``datetime.datetime`` even if ``fold`` is unchanged.
            """
            field_names = (
                'year', 'month', 'day', 'hour', 'minute', 'second',
                'microsecond', 'tzinfo'
            )

            # Fold positional arguments into the keyword mapping.
            for positional, field in zip(args, field_names):
                if field in kwargs:
                    raise TypeError('Duplicate argument: {}'.format(field))
                kwargs[field] = positional

            # Anything not supplied is copied from this instance.
            for field in field_names:
                kwargs.setdefault(field, getattr(self, field))

            if kwargs.get('fold', 1):
                target_class = self.__class__
            else:
                target_class = datetime

            return target_class(**kwargs)

        @property
        def fold(self):
            return 1

    def enfold(dt, fold=1):
        """
        Provides a unified interface for assigning the ``fold`` attribute to
        datetimes both before and after the implementation of PEP-495.

        :param fold:
            The value for the ``fold`` attribute in the returned datetime. This
            should be either 0 or 1.

        :return:
            Returns an object for which ``getattr(dt, 'fold', 0)`` returns
            ``fold`` for all versions of Python. In versions prior to
            Python 3.6, this is a ``_DatetimeWithFold`` object, which is a
            subclass of :py:class:`datetime.datetime` with the ``fold``
            attribute added, if ``fold`` is 1.

        .. versionadded:: 2.6.0
        """
        if getattr(dt, 'fold', 0) == fold:
            # Already in the requested state.
            return dt

        ctor_args = dt.timetuple()[:6]
        ctor_args += (dt.microsecond, dt.tzinfo)

        target_class = _DatetimeWithFold if fold else datetime
        return target_class(*ctor_args)
|
|
||||||
|
|
||||||
|
|
||||||
def _validate_fromutc_inputs(f):
|
|
||||||
"""
|
|
||||||
The CPython version of ``fromutc`` checks that the input is a ``datetime``
|
|
||||||
object and that ``self`` is attached as its ``tzinfo``.
|
|
||||||
"""
|
|
||||||
@wraps(f)
|
|
||||||
def fromutc(self, dt):
|
|
||||||
if not isinstance(dt, datetime):
|
|
||||||
raise TypeError("fromutc() requires a datetime argument")
|
|
||||||
if dt.tzinfo is not self:
|
|
||||||
raise ValueError("dt.tzinfo is not self")
|
|
||||||
|
|
||||||
return f(self, dt)
|
|
||||||
|
|
||||||
return fromutc
|
|
||||||
|
|
||||||
|
|
||||||
class _tzinfo(tzinfo):
    """
    Base class for all ``dateutil`` ``tzinfo`` objects.
    """

    def is_ambiguous(self, dt):
        """
        Whether or not the "wall time" of a given datetime is ambiguous in this
        zone.

        :param dt:
            A :py:class:`datetime.datetime`, naive or time zone aware.


        :return:
            Returns ``True`` if ambiguous, ``False`` otherwise.

        .. versionadded:: 2.6.0
        """
        attached = dt.replace(tzinfo=self)

        unfolded = enfold(attached, fold=0)
        folded = enfold(attached, fold=1)

        offsets_match = unfolded.utcoffset() == folded.utcoffset()
        walls_match = (unfolded.replace(tzinfo=None) ==
                       folded.replace(tzinfo=None))

        # Ambiguous: same wall clock reading, different UTC offsets.
        return walls_match and not offsets_match

    def _fold_status(self, dt_utc, dt_wall):
        """
        Determine the fold status of a "wall" datetime, given a representation
        of the same datetime as a (naive) UTC datetime. This is calculated based
        on the assumption that ``dt.utcoffset() - dt.dst()`` is constant for all
        datetimes, and that this offset is the actual number of hours separating
        ``dt_utc`` and ``dt_wall``.

        :param dt_utc:
            Representation of the datetime as UTC

        :param dt_wall:
            Representation of the datetime as "wall time". This parameter must
            either have a `fold` attribute or have a fold-naive
            :class:`datetime.tzinfo` attached, otherwise the calculation may
            fail.
        """
        if not self.is_ambiguous(dt_wall):
            return 0

        wall_minus_utc = dt_wall - dt_utc
        standard_offset = dt_utc.utcoffset() - dt_utc.dst()
        return int(wall_minus_utc == standard_offset)

    def _fold(self, dt):
        # Datetimes without a ``fold`` attribute count as fold 0.
        return getattr(dt, 'fold', 0)

    def _fromutc(self, dt):
        """
        Given a timezone-aware datetime in a given timezone, calculates a
        timezone-aware datetime in a new timezone.

        Since this is the one time that we *know* we have an unambiguous
        datetime object, we take this opportunity to determine whether the
        datetime is ambiguous and in a "fold" state (e.g. if it's the first
        occurrence, chronologically, of the ambiguous datetime).

        :param dt:
            A timezone-aware :class:`datetime.datetime` object.
        """

        # Re-implement the algorithm from Python's datetime.py
        utc_offset = dt.utcoffset()
        if utc_offset is None:
            raise ValueError("fromutc() requires a non-None utcoffset() "
                             "result")

        # The original datetime.py code assumes that `dst()` defaults to
        # zero during ambiguous times. PEP 495 inverts this presumption, so
        # for pre-PEP 495 versions of python, we need to tweak the algorithm.
        dst_offset = dt.dst()
        if dst_offset is None:
            raise ValueError("fromutc() requires a non-None dst() result")

        dt += utc_offset - dst_offset

        # Set fold=1 so we can default to being in the fold for
        # ambiguous dates.
        folded_dst = enfold(dt, fold=1).dst()
        if folded_dst is None:
            raise ValueError("fromutc(): dt.dst gave inconsistent "
                             "results; cannot convert")
        return dt + folded_dst

    @_validate_fromutc_inputs
    def fromutc(self, dt):
        """
        Given a timezone-aware datetime in a given timezone, calculates a
        timezone-aware datetime in a new timezone.

        Since this is the one time that we *know* we have an unambiguous
        datetime object, we take this opportunity to determine whether the
        datetime is ambiguous and in a "fold" state (e.g. if it's the first
        occurrence, chronologically, of the ambiguous datetime).

        :param dt:
            A timezone-aware :class:`datetime.datetime` object.
        """
        wall = self._fromutc(dt)

        # Calculate the fold status given the two datetimes.
        fold_flag = self._fold_status(dt, wall)

        # Set the default fold value for ambiguous dates
        return enfold(wall, fold=fold_flag)
|
|
||||||
|
|
||||||
|
|
||||||
class tzrangebase(_tzinfo):
    """
    This is an abstract base class for time zones represented by an annual
    transition into and out of DST. Child classes should implement the following
    methods:

        * ``__init__(self, *args, **kwargs)``
        * ``transitions(self, year)`` - this is expected to return a tuple of
          datetimes representing the DST on and off transitions in standard
          time.

    A fully initialized ``tzrangebase`` subclass should also provide the
    following attributes:
        * ``hasdst``: Boolean whether or not the zone uses DST.
        * ``_dst_offset`` / ``_std_offset``: :class:`datetime.timedelta` objects
          representing the respective UTC offsets.
        * ``_dst_abbr`` / ``_std_abbr``: Strings representing the timezone short
          abbreviations in DST and STD, respectively.
        * ``_hasdst``: Whether or not the zone has DST.

    .. versionadded:: 2.6.0
    """
    def __init__(self):
        raise NotImplementedError('tzrangebase is an abstract base class')

    def utcoffset(self, dt):
        """Return the DST or standard UTC offset for ``dt``, or ``None``
        when the DST state cannot be determined."""
        isdst = self._isdst(dt)

        if isdst is None:
            return None
        elif isdst:
            return self._dst_offset
        else:
            return self._std_offset

    def dst(self, dt):
        """Return the DST adjustment for ``dt``: the difference between the
        DST and standard offsets while DST applies, ``ZERO`` otherwise, or
        ``None`` when the DST state cannot be determined."""
        isdst = self._isdst(dt)

        if isdst is None:
            return None
        elif isdst:
            return self._dst_base_offset
        else:
            return ZERO

    @tzname_in_python2
    def tzname(self, dt):
        """Return the DST or standard abbreviation for ``dt``."""
        if self._isdst(dt):
            return self._dst_abbr
        else:
            return self._std_abbr

    def fromutc(self, dt):
        """ Given a datetime in UTC, return local time """
        if not isinstance(dt, datetime):
            raise TypeError("fromutc() requires a datetime argument")

        if dt.tzinfo is not self:
            raise ValueError("dt.tzinfo is not self")

        # Get transitions - if there are none, fixed offset
        transitions = self.transitions(dt.year)
        if transitions is None:
            return dt + self.utcoffset(dt)

        # Get the transition times in UTC
        dston, dstoff = transitions

        dston -= self._std_offset
        dstoff -= self._std_offset

        utc_transitions = (dston, dstoff)
        dt_utc = dt.replace(tzinfo=None)

        isdst = self._naive_isdst(dt_utc, utc_transitions)

        if isdst:
            dt_wall = dt + self._dst_offset
        else:
            dt_wall = dt + self._std_offset

        # Only a standard-time reading inside the repeated interval is
        # marked as the second occurrence.
        _fold = int(not isdst and self.is_ambiguous(dt_wall))

        return enfold(dt_wall, fold=_fold)

    def is_ambiguous(self, dt):
        """
        Whether or not the "wall time" of a given datetime is ambiguous in this
        zone.

        :param dt:
            A :py:class:`datetime.datetime`, naive or time zone aware.


        :return:
            Returns ``True`` if ambiguous, ``False`` otherwise.

        .. versionadded:: 2.6.0
        """
        if not self.hasdst:
            return False

        # ``_isdst`` and ``fromutc`` both accept ``None`` from
        # ``transitions`` as "no transitions this year"; do the same here
        # instead of failing on the tuple unpacking below.
        transitions = self.transitions(dt.year)
        if transitions is None:
            return False

        start, end = transitions

        dt = dt.replace(tzinfo=None)
        return (end <= dt < end + self._dst_base_offset)

    def _isdst(self, dt):
        """Return whether DST applies to ``dt``; ``None`` when ``dt`` is
        ``None`` and the zone has DST."""
        if not self.hasdst:
            return False
        elif dt is None:
            return None

        transitions = self.transitions(dt.year)

        if transitions is None:
            return False

        dt = dt.replace(tzinfo=None)

        isdst = self._naive_isdst(dt, transitions)

        # Handle ambiguous dates
        if not isdst and self.is_ambiguous(dt):
            return not self._fold(dt)
        else:
            return isdst

    def _naive_isdst(self, dt, transitions):
        """Return whether the naive ``dt`` lies in the DST interval described
        by the ``(dston, dstoff)`` pair in ``transitions``."""
        dston, dstoff = transitions

        dt = dt.replace(tzinfo=None)

        if dston < dstoff:
            isdst = dston <= dt < dstoff
        else:
            # DST starts later in the year than it ends, so it covers
            # everything outside [dstoff, dston).
            isdst = not dstoff <= dt < dston

        return isdst

    @property
    def _dst_base_offset(self):
        return self._dst_offset - self._std_offset

    __hash__ = None

    def __ne__(self, other):
        return not (self == other)

    def __repr__(self):
        return "%s(...)" % self.__class__.__name__

    __reduce__ = object.__reduce__
|
|
||||||
@ -1,80 +0,0 @@
|
|||||||
from datetime import timedelta
|
|
||||||
import weakref
|
|
||||||
from collections import OrderedDict
|
|
||||||
|
|
||||||
from six.moves import _thread
|
|
||||||
|
|
||||||
|
|
||||||
class _TzSingleton(type):
|
|
||||||
def __init__(cls, *args, **kwargs):
|
|
||||||
cls.__instance = None
|
|
||||||
super(_TzSingleton, cls).__init__(*args, **kwargs)
|
|
||||||
|
|
||||||
def __call__(cls):
|
|
||||||
if cls.__instance is None:
|
|
||||||
cls.__instance = super(_TzSingleton, cls).__call__()
|
|
||||||
return cls.__instance
|
|
||||||
|
|
||||||
|
|
||||||
class _TzFactory(type):
|
|
||||||
def instance(cls, *args, **kwargs):
|
|
||||||
"""Alternate constructor that returns a fresh instance"""
|
|
||||||
return type.__call__(cls, *args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class _TzOffsetFactory(_TzFactory):
    """
    Metaclass that caches instances by ``(name, offset)``.

    Instances are held in a weak-value dictionary, plus a strong cache
    limited to 8 entries.
    """

    def __init__(cls, *args, **kwargs):
        cls.__instances = weakref.WeakValueDictionary()
        cls.__strong_cache = OrderedDict()
        cls.__strong_cache_size = 8

        cls._cache_lock = _thread.allocate_lock()

    def __call__(cls, name, offset):
        # Timedelta offsets are keyed by their length in seconds.
        offset_key = (offset.total_seconds()
                      if isinstance(offset, timedelta) else offset)
        key = (name, offset_key)

        instance = cls.__instances.get(key, None)
        if instance is None:
            fresh = cls.instance(name, offset)
            instance = cls.__instances.setdefault(key, fresh)

        # This lock may not be necessary in Python 3. See GH issue #901
        with cls._cache_lock:
            # Pop and re-insert so this key becomes the newest entry.
            cached = cls.__strong_cache.pop(key, instance)
            cls.__strong_cache[key] = cached

            # Remove an item if the strong cache is overpopulated
            if len(cls.__strong_cache) > cls.__strong_cache_size:
                cls.__strong_cache.popitem(last=False)

        return instance
|
|
||||||
|
|
||||||
|
|
||||||
class _TzStrFactory(_TzFactory):
    """
    Metaclass that caches instances by ``(s, posix_offset)``.

    Instances are held in a weak-value dictionary, plus a strong cache
    limited to 8 entries.
    """

    def __init__(cls, *args, **kwargs):
        cls.__instances = weakref.WeakValueDictionary()
        cls.__strong_cache = OrderedDict()
        cls.__strong_cache_size = 8

        cls.__cache_lock = _thread.allocate_lock()

    def __call__(cls, s, posix_offset=False):
        key = (s, posix_offset)

        instance = cls.__instances.get(key, None)
        if instance is None:
            fresh = cls.instance(s, posix_offset)
            instance = cls.__instances.setdefault(key, fresh)

        # This lock may not be necessary in Python 3. See GH issue #901
        with cls.__cache_lock:
            # Pop and re-insert so this key becomes the newest entry.
            cached = cls.__strong_cache.pop(key, instance)
            cls.__strong_cache[key] = cached

            # Remove an item if the strong cache is overpopulated
            if len(cls.__strong_cache) > cls.__strong_cache_size:
                cls.__strong_cache.popitem(last=False)

        return instance
|
|
||||||
|
|
||||||
File diff suppressed because it is too large
Load Diff
@ -1,370 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
"""
|
|
||||||
This module provides an interface to the native time zone data on Windows,
|
|
||||||
including :py:class:`datetime.tzinfo` implementations.
|
|
||||||
|
|
||||||
Attempting to import this module on a non-Windows platform will raise an
|
|
||||||
:py:obj:`ImportError`.
|
|
||||||
"""
|
|
||||||
# This code was originally contributed by Jeffrey Harris.
|
|
||||||
import datetime
|
|
||||||
import struct
|
|
||||||
|
|
||||||
from six.moves import winreg
|
|
||||||
from six import text_type
|
|
||||||
|
|
||||||
try:
|
|
||||||
import ctypes
|
|
||||||
from ctypes import wintypes
|
|
||||||
except ValueError:
|
|
||||||
# ValueError is raised on non-Windows systems for some horrible reason.
|
|
||||||
raise ImportError("Running tzwin on non-Windows system")
|
|
||||||
|
|
||||||
from ._common import tzrangebase
|
|
||||||
|
|
||||||
__all__ = ["tzwin", "tzwinlocal", "tzres"]
|
|
||||||
|
|
||||||
ONEWEEK = datetime.timedelta(7)
|
|
||||||
|
|
||||||
TZKEYNAMENT = r"SOFTWARE\Microsoft\Windows NT\CurrentVersion\Time Zones"
|
|
||||||
TZKEYNAME9X = r"SOFTWARE\Microsoft\Windows\CurrentVersion\Time Zones"
|
|
||||||
TZLOCALKEYNAME = r"SYSTEM\CurrentControlSet\Control\TimeZoneInformation"
|
|
||||||
|
|
||||||
|
|
||||||
def _settzkeyname():
|
|
||||||
handle = winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE)
|
|
||||||
try:
|
|
||||||
winreg.OpenKey(handle, TZKEYNAMENT).Close()
|
|
||||||
TZKEYNAME = TZKEYNAMENT
|
|
||||||
except WindowsError:
|
|
||||||
TZKEYNAME = TZKEYNAME9X
|
|
||||||
handle.Close()
|
|
||||||
return TZKEYNAME
|
|
||||||
|
|
||||||
|
|
||||||
TZKEYNAME = _settzkeyname()
|
|
||||||
|
|
||||||
|
|
||||||
class tzres(object):
    """
    Class for accessing ``tzres.dll``, which contains timezone name related
    resources.

    .. versionadded:: 2.5.0
    """
    p_wchar = ctypes.POINTER(wintypes.WCHAR)        # Pointer to a wide char

    def __init__(self, tzres_loc='tzres.dll'):
        """
        :param tzres_loc:
            Name or path of the resource DLL to load, passed to
            ``ctypes.WinDLL``.
        """
        # Load the user32 DLL so we can load strings from tzres
        user32 = ctypes.WinDLL('user32')

        # Specify the LoadStringW function
        user32.LoadStringW.argtypes = (wintypes.HINSTANCE,
                                       wintypes.UINT,
                                       wintypes.LPWSTR,
                                       ctypes.c_int)

        self.LoadStringW = user32.LoadStringW
        self._tzres = ctypes.WinDLL(tzres_loc)
        self.tzres_loc = tzres_loc

    def load_name(self, offset):
        """
        Load a timezone name from a DLL offset (integer).

        >>> from dateutil.tzwin import tzres
        >>> tzr = tzres()
        >>> print(tzr.load_name(112))
        Eastern Standard Time

        :param offset:
            A positive integer value referring to a string from the tzres dll.

        :return:
            The first ``nchar`` characters behind the pointer filled in by
            ``LoadStringW``, where ``nchar`` is that call's return value.

        .. note::

            Offsets found in the registry are generally of the form
            ``@tzres.dll,-114``. The offset in this case is 114, not -114.

        """
        resource = self.p_wchar()
        # The address of the pointer itself is passed as the "buffer", with a
        # buffer size of 0.
        # NOTE(review): per the LoadStringW documentation a zero size makes
        # the call store a pointer to the resource there -- confirm.
        lpBuffer = ctypes.cast(ctypes.byref(resource), wintypes.LPWSTR)
        nchar = self.LoadStringW(self._tzres._handle, offset, lpBuffer, 0)
        return resource[:nchar]

    def name_from_string(self, tzname_str):
        """
        Parse strings as returned from the Windows registry into the time zone
        name as defined in the registry.

        >>> from dateutil.tzwin import tzres
        >>> tzr = tzres()
        >>> print(tzr.name_from_string('@tzres.dll,-251'))
        Dateline Daylight Time
        >>> print(tzr.name_from_string('Eastern Standard Time'))
        Eastern Standard Time

        :param tzname_str:
            A timezone name string as returned from a Windows registry key.

        :return:
            Returns the localized timezone string from tzres.dll if the string
            is of the form `@tzres.dll,-offset`, else returns the input string.

        :raises ValueError:
            If the string starts with ``@`` but no integer offset follows a
            ``,-`` separator.
        """
        if not tzname_str.startswith('@'):
            return tzname_str

        name_splt = tzname_str.split(',-')
        try:
            offset = int(name_splt[1])
        except (IndexError, ValueError):
            # IndexError: no ",-" separator; ValueError: non-numeric offset.
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            raise ValueError("Malformed timezone string.")

        return self.load_name(offset)
|
|
||||||
|
|
||||||
|
|
||||||
class tzwinbase(tzrangebase):
    """tzinfo class based on win32's timezones available in the registry."""
    def __init__(self):
        raise NotImplementedError('tzwinbase is an abstract base class')

    def __eq__(self, other):
        """Compare two registry-based zones rule by rule.

        Returns ``NotImplemented`` for anything that is not a ``tzwinbase``.
        """
        # Compare on all relevant dimensions, including name.
        if not isinstance(other, tzwinbase):
            return NotImplemented

        # The transition months take part in the comparison as well: they
        # feed transitions(), so zones differing only there are not equal.
        return (self._std_offset == other._std_offset and
                self._dst_offset == other._dst_offset and
                self._stdmonth == other._stdmonth and
                self._dstmonth == other._dstmonth and
                self._stddayofweek == other._stddayofweek and
                self._dstdayofweek == other._dstdayofweek and
                self._stdweeknumber == other._stdweeknumber and
                self._dstweeknumber == other._dstweeknumber and
                self._stdhour == other._stdhour and
                self._dsthour == other._dsthour and
                self._stdminute == other._stdminute and
                self._dstminute == other._dstminute and
                self._std_abbr == other._std_abbr and
                self._dst_abbr == other._dst_abbr)

    @staticmethod
    def list():
        """Return a list of all time zones known to the system."""
        with winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE) as handle:
            with winreg.OpenKey(handle, TZKEYNAME) as tzkey:
                # QueryInfoKey()[0] is used as the sub-key count to enumerate.
                result = [winreg.EnumKey(tzkey, i)
                          for i in range(winreg.QueryInfoKey(tzkey)[0])]
        return result

    def display(self):
        """
        Return the display name of the time zone.
        """
        return self._display

    def transitions(self, year):
        """
        For a given year, get the DST on and off transition times, expressed
        always on the standard time side. For zones with no transitions, this
        function returns ``None``.

        :param year:
            The year whose transitions you would like to query.

        :return:
            Returns a :class:`tuple` of :class:`datetime.datetime` objects,
            ``(dston, dstoff)`` for zones with an annual DST transition, or
            ``None`` for fixed offset zones.
        """

        if not self.hasdst:
            return None

        dston = picknthweekday(year, self._dstmonth, self._dstdayofweek,
                               self._dsthour, self._dstminute,
                               self._dstweeknumber)

        dstoff = picknthweekday(year, self._stdmonth, self._stddayofweek,
                                self._stdhour, self._stdminute,
                                self._stdweeknumber)

        # Ambiguous dates default to the STD side
        dstoff -= self._dst_base_offset

        return dston, dstoff

    def _get_hasdst(self):
        # A DST start month of 0 is treated as "this zone has no DST rule".
        return self._dstmonth != 0

    @property
    def _dst_base_offset(self):
        # Read-only view of the value subclasses store in _dst_base_offset_.
        return self._dst_base_offset_
|
|
||||||
|
|
||||||
|
|
||||||
class tzwin(tzwinbase):
    """
    Time zone object built from a named zone entry in the Windows registry.

    Like :py:class:`dateutil.tz.tzrange`, the zone is described by a single
    offset rule with either 0 or 2 transitions per year.

    :param name:
        The name of a Windows time zone key, e.g. "Eastern Standard Time".
        The full list of keys can be retrieved with :func:`tzwin.list`.
    """

    def __init__(self, name):
        self._name = name

        with winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE) as hklm:
            zone_path = text_type("{kn}\\{name}").format(kn=TZKEYNAME,
                                                         name=name)
            with winreg.OpenKey(hklm, zone_path) as zone_key:
                values = valuestodict(zone_key)

        self._std_abbr = values["Std"]
        self._dst_abbr = values["Dlt"]
        self._display = values["Display"]

        # "TZI" unpacks as three longs followed by sixteen shorts.  For the
        # meaning of the fields see the win32 TIME_ZONE_INFORMATION docs.
        fields = struct.unpack("=3l16h", values["TZI"])
        bias, std_bias, dst_bias = fields[0], fields[1], fields[2]

        std_minutes = -bias - std_bias            # Bias + StandardBias * -1
        dst_minutes = std_minutes - dst_bias      # + DaylightBias * -1
        self._std_offset = datetime.timedelta(minutes=std_minutes)
        self._dst_offset = datetime.timedelta(minutes=dst_minutes)

        # Rule for the switch back to standard time.
        # (day of week: Sunday = 0; week number: Last = 5)
        std_rule = fields[4:9]
        (self._stdmonth,
         self._stddayofweek,
         self._stdweeknumber,
         self._stdhour,
         self._stdminute) = std_rule

        # Rule for the switch to daylight time, same layout as above.
        dst_rule = fields[12:17]
        (self._dstmonth,
         self._dstdayofweek,
         self._dstweeknumber,
         self._dsthour,
         self._dstminute) = dst_rule

        self._dst_base_offset_ = self._dst_offset - self._std_offset
        self.hasdst = self._get_hasdst()

    def __repr__(self):
        return "tzwin(%s)" % repr(self._name)

    def __reduce__(self):
        # Pickle as "call the class again with the same key name".
        ctor_args = (self._name,)
        return (self.__class__, ctor_args)
|
|
||||||
|
|
||||||
|
|
||||||
class tzwinlocal(tzwinbase):
    """
    Class representing the local time zone information in the Windows registry

    While :class:`dateutil.tz.tzlocal` makes system calls (via the :mod:`time`
    module) to retrieve time zone information, ``tzwinlocal`` retrieves the
    rules directly from the Windows registry and creates an object like
    :class:`dateutil.tz.tzwin`.

    Because Windows does not have an equivalent of :func:`time.tzset`, on
    Windows, :class:`dateutil.tz.tzlocal` instances will always reflect the
    time zone settings *at the time that the process was started*, meaning
    changes to the machine's time zone settings during the run of a program
    on Windows will **not** be reflected by :class:`dateutil.tz.tzlocal`.
    Because ``tzwinlocal`` reads the registry directly, it is unaffected by
    this issue.
    """
    def __init__(self):
        with winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE) as hklm:
            with winreg.OpenKey(hklm, TZLOCALKEYNAME) as local_key:
                local = valuestodict(local_key)

            self._std_abbr = local["StandardName"]
            self._dst_abbr = local["DaylightName"]

            # The display name lives under the zone's own key; if that key
            # cannot be opened the display name is simply left unset.
            try:
                zone_path = text_type('{kn}\\{sn}').format(kn=TZKEYNAME,
                                                          sn=self._std_abbr)
                with winreg.OpenKey(hklm, zone_path) as zone_key:
                    zone_values = valuestodict(zone_key)
                    self._display = zone_values["Display"]
            except OSError:
                self._display = None

        std_minutes = -local["Bias"] - local["StandardBias"]
        dst_minutes = std_minutes - local["DaylightBias"]

        self._std_offset = datetime.timedelta(minutes=std_minutes)
        self._dst_offset = datetime.timedelta(minutes=dst_minutes)

        # For reasons unclear, in this particular key, the day of week has been
        # moved to the END of the SYSTEMTIME structure.
        std_start = struct.unpack("=8h", local["StandardStart"])
        (self._stdmonth,
         self._stdweeknumber,              # Last = 5
         self._stdhour,
         self._stdminute) = std_start[1:5]
        self._stddayofweek = std_start[7]

        dst_start = struct.unpack("=8h", local["DaylightStart"])
        (self._dstmonth,
         self._dstweeknumber,              # Last = 5
         self._dsthour,
         self._dstminute) = dst_start[1:5]
        self._dstdayofweek = dst_start[7]

        self._dst_base_offset_ = self._dst_offset - self._std_offset
        self.hasdst = self._get_hasdst()

    def __repr__(self):
        return "tzwinlocal()"

    def __str__(self):
        # str will return the standard name, not the daylight name.
        return "tzwinlocal(%s)" % repr(self._std_abbr)

    def __reduce__(self):
        # No constructor arguments: unpickling re-reads the registry.
        return (self.__class__, ())
|
|
||||||
|
|
||||||
|
|
||||||
def picknthweekday(year, month, dayofweek, hour, minute, whichweek):
    """Return the ``whichweek``-th ``dayofweek`` of ``month`` at ``hour:minute``.

    ``dayofweek == 0`` means Sunday; ``whichweek == 5`` means the last
    occurrence in the month.
    """
    month_start = datetime.datetime(year, month, 1, hour, minute)

    # This will work if dayofweek is ISO weekday (1-7) or Microsoft-style (0-6),
    # Because 7 % 7 = 0
    days_ahead = (dayofweek - month_start.isoweekday()) % 7
    first_match = month_start.replace(day=days_ahead + 1)

    candidate = first_match + ((whichweek - 1) * ONEWEEK)
    if candidate.month == month:
        return candidate

    # Overshot into the next month: step back one week.
    return candidate - ONEWEEK
|
|
||||||
|
|
||||||
|
|
||||||
def valuestodict(key):
    """Convert a registry key's values to a dictionary.

    DWORD values are reinterpreted as signed 32-bit integers.  String values
    have ``@tzres`` references resolved through :class:`tzres` and trailing
    NULs removed.  Values of any other type are stored untouched.
    """
    dword_types = (winreg.REG_DWORD, winreg.REG_DWORD_LITTLE_ENDIAN)
    value_count = winreg.QueryInfoKey(key)[1]

    resolver = None     # tzres helper, created on first use only
    converted = {}

    for index in range(value_count):
        name, data, kind = winreg.EnumValue(key, index)

        if kind in dword_types:
            # If it's a DWORD (32-bit integer), it's stored as unsigned -
            # convert that to a proper signed integer
            if data & (1 << 31):
                data = data - (1 << 32)
        elif kind == winreg.REG_SZ:
            # If it's a reference to the tzres DLL, load the actual string
            if data.startswith('@tzres'):
                if not resolver:
                    resolver = tzres()
                data = resolver.name_from_string(data)

            data = data.rstrip('\x00')      # Remove trailing nulls

        converted[name] = data

    return converted
|
|
||||||
@ -1,2 +0,0 @@
|
|||||||
# tzwin has moved to dateutil.tz.win
|
|
||||||
from .tz.win import *
|
|
||||||
@ -1,71 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
"""
|
|
||||||
This module offers general convenience and utility functions for dealing with
|
|
||||||
datetimes.
|
|
||||||
|
|
||||||
.. versionadded:: 2.7.0
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from datetime import datetime, time
|
|
||||||
|
|
||||||
|
|
||||||
def today(tzinfo=None):
    """
    Return midnight at the start of the current day.

    :param tzinfo:
        The time zone to attach (also used to determine the current day).

    :return:
        A :py:class:`datetime.datetime` object representing the current day
        at midnight.
    """
    current_day = datetime.now(tzinfo).date()
    midnight = time(0, tzinfo=tzinfo)
    return datetime.combine(current_day, midnight)
|
|
||||||
|
|
||||||
|
|
||||||
def default_tzinfo(dt, tzinfo):
    """
    Attach ``tzinfo`` to ``dt`` only when ``dt`` has no time zone yet.

    Handy when a datetime may or may not already carry a time zone, for
    instance after parsing a string in which the zone is optional.

    .. doctest::

        >>> from dateutil.tz import tzoffset
        >>> from dateutil.parser import parse
        >>> from dateutil.utils import default_tzinfo
        >>> dflt_tz = tzoffset("EST", -18000)
        >>> print(default_tzinfo(parse('2014-01-01 12:30 UTC'), dflt_tz))
        2014-01-01 12:30:00+00:00
        >>> print(default_tzinfo(parse('2014-01-01 12:30'), dflt_tz))
        2014-01-01 12:30:00-05:00

    :param dt:
        The datetime on which to replace the time zone

    :param tzinfo:
        The :py:class:`datetime.tzinfo` subclass instance to assign to
        ``dt`` if (and only if) it is naive.

    :return:
        ``dt`` itself when its ``tzinfo`` is already set, otherwise a copy
        of ``dt`` with ``tzinfo`` attached.
    """
    if dt.tzinfo is None:
        return dt.replace(tzinfo=tzinfo)
    return dt
|
|
||||||
|
|
||||||
|
|
||||||
def within_delta(dt1, dt2, delta):
    """
    Tell whether ``dt1`` and ``dt2`` differ by no more than ``abs(delta)``.

    Useful for treating two datetimes as equal when the difference between
    them is negligible.
    """
    tolerance = abs(delta)
    gap = dt1 - dt2
    return (-tolerance <= gap) and (gap <= tolerance)
|
|
||||||
@ -1,167 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
import warnings
|
|
||||||
import json
|
|
||||||
|
|
||||||
from tarfile import TarFile
|
|
||||||
from pkgutil import get_data
|
|
||||||
from io import BytesIO
|
|
||||||
|
|
||||||
from dateutil.tz import tzfile as _tzfile
|
|
||||||
|
|
||||||
__all__ = ["get_zonefile_instance", "gettz", "gettz_db_metadata"]
|
|
||||||
|
|
||||||
ZONEFILENAME = "dateutil-zoneinfo.tar.gz"
|
|
||||||
METADATA_FN = 'METADATA'
|
|
||||||
|
|
||||||
|
|
||||||
class tzfile(_tzfile):
    """``dateutil.tz.tzfile`` variant that pickles by zone name."""

    def __reduce__(self):
        # Unpickling calls this module's gettz() with the stored filename.
        lookup_args = (self._filename,)
        return (gettz, lookup_args)
|
|
||||||
|
|
||||||
|
|
||||||
def getzoneinfofile_stream():
    """Return the bundled zoneinfo tarball as an in-memory byte stream.

    :return:
        A :class:`io.BytesIO` over the bytes of ``ZONEFILENAME`` as returned
        by :func:`pkgutil.get_data`, or ``None`` (after emitting a warning)
        when the data cannot be read or is not available.
    """
    try:
        payload = get_data(__name__, ZONEFILENAME)
    except IOError as e:  # TODO switch to FileNotFoundError?
        warnings.warn("I/O error({0}): {1}".format(e.errno, e.strerror))
        return None

    # get_data() signals "package not found / loader cannot serve data" by
    # returning None rather than raising.  BytesIO(None) would silently give
    # an empty stream, which then fails when opened as a tar file.
    if payload is None:
        warnings.warn("Could not load {0}: package data is "
                      "unavailable".format(ZONEFILENAME))
        return None

    return BytesIO(payload)
|
|
||||||
|
|
||||||
|
|
||||||
class ZoneInfoFile(object):
    """In-memory view of a zoneinfo tarball.

    ``zones`` maps member names to time zone objects and ``metadata`` holds
    the decoded ``METADATA`` member, or ``None`` when there is none.
    """

    def __init__(self, zonefile_stream=None):
        """
        :param zonefile_stream:
            File-like object holding the tarball, or ``None`` for an empty
            instance.
        """
        if zonefile_stream is None:
            self.zones = {}
            self.metadata = None
            return

        with TarFile.open(fileobj=zonefile_stream) as archive:
            members = archive.getmembers()

            # Every regular file except the metadata member is one zone.
            zones = {}
            for member in members:
                if member.isfile() and member.name != METADATA_FN:
                    zones[member.name] = tzfile(archive.extractfile(member),
                                                filename=member.name)
            self.zones = zones

            # deal with links: They'll point to their parent object. Less
            # waste of memory
            aliases = {}
            for member in members:
                if member.islnk() or member.issym():
                    aliases[member.name] = self.zones[member.linkname]
            self.zones.update(aliases)

            try:
                raw = archive.extractfile(archive.getmember(METADATA_FN))
                text = raw.read().decode('UTF-8')
                self.metadata = json.loads(text)
            except KeyError:
                # no metadata in tar file
                self.metadata = None

    def get(self, name, default=None):
        """
        Convenience wrapper around ``self.zones.get`` for looking up a zone.

        :param name:
            The name of the zone to retrieve. (Generally IANA zone names)

        :param default:
            The value to return in the event of a missing key.

        .. versionadded:: 2.6.0

        """
        return self.zones.get(name, default)
|
|
||||||
|
|
||||||
|
|
||||||
# The current API has gettz as a module function, although in fact it taps into
|
|
||||||
# a stateful class. So as a workaround for now, without changing the API, we
|
|
||||||
# will create a new "global" class instance the first time a user requests a
|
|
||||||
# timezone. Ugly, but adheres to the api.
|
|
||||||
#
|
|
||||||
# TODO: Remove after deprecation period.
|
|
||||||
_CLASS_ZONE_INSTANCE = []
|
|
||||||
|
|
||||||
|
|
||||||
def get_zonefile_instance(new_instance=False):
    """
    Return a :class:`ZoneInfoFile` built from the data shipped with
    ``dateutil``.  A single instance is cached on this function and reused
    by default.

    :param new_instance:
        If ``True``, a new instance of :class:`ZoneInfoFile` is instantiated and
        used as the cached instance for the next call. Otherwise, new instances
        are created only as necessary.

    :return:
        Returns a :class:`ZoneInfoFile` object.

    .. versionadded:: 2.6
    """
    cached = None
    if not new_instance:
        cached = getattr(get_zonefile_instance, '_cached_instance', None)

    if cached is not None:
        return cached

    fresh = ZoneInfoFile(getzoneinfofile_stream())
    get_zonefile_instance._cached_instance = fresh
    return fresh
|
|
||||||
|
|
||||||
|
|
||||||
def gettz(name):
    """
    Look up a time zone in the zoneinfo tarball that is packaged with
    dateutil.

    :param name:
        An IANA-style time zone name, as found in the zoneinfo file.

    :return:
        Returns a :class:`dateutil.tz.tzfile` time zone object.

    .. warning::
        It is generally inadvisable to use this function, and it is only
        provided for API compatibility with earlier versions. This is *not*
        equivalent to ``dateutil.tz.gettz()``, which selects an appropriate
        time zone based on the inputs, favoring system zoneinfo. This is ONLY
        for accessing the dateutil-specific zoneinfo (which may be out of
        date compared to the system zoneinfo).

    .. deprecated:: 2.6
        If you need to use a specific zoneinfofile over the system zoneinfo,
        instantiate a :class:`dateutil.zoneinfo.ZoneInfoFile` object and call
        :func:`dateutil.zoneinfo.ZoneInfoFile.get(name)` instead.

        Use :func:`get_zonefile_instance` to retrieve an instance of the
        dateutil-provided zoneinfo.
    """
    notice = ("zoneinfo.gettz() will be removed in future versions, "
              "to use the dateutil-provided zoneinfo files, instantiate a "
              "ZoneInfoFile object and use ZoneInfoFile.zones.get() "
              "instead. See the documentation for details.")
    warnings.warn(notice, DeprecationWarning)

    # The module-level list holds at most one lazily created ZoneInfoFile.
    if not _CLASS_ZONE_INSTANCE:
        _CLASS_ZONE_INSTANCE.append(ZoneInfoFile(getzoneinfofile_stream()))

    shared = _CLASS_ZONE_INSTANCE[0]
    return shared.zones.get(name)
|
|
||||||
|
|
||||||
|
|
||||||
def gettz_db_metadata():
    """ Get the zonefile metadata

    See `zonefile_metadata`_

    :returns:
        The ``metadata`` attribute of the shared :class:`ZoneInfoFile`
        instance (a dictionary with the database metadata, or ``None`` when
        the tarball carries none).

    .. deprecated:: 2.6
        See deprecation warning in :func:`zoneinfo.gettz`. To get metadata,
        query the attribute ``zoneinfo.ZoneInfoFile.metadata``.
    """
    # The original message omitted "instantiate a", leaving the sentence
    # without a verb; it now matches the wording used by gettz().
    warnings.warn("zoneinfo.gettz_db_metadata() will be removed in future "
                  "versions, to use the dateutil-provided zoneinfo files, "
                  "instantiate a ZoneInfoFile object and query the "
                  "'metadata' attribute instead. See the documentation for "
                  "details.",
                  DeprecationWarning)

    # The module-level list holds at most one lazily created ZoneInfoFile.
    if len(_CLASS_ZONE_INSTANCE) == 0:
        _CLASS_ZONE_INSTANCE.append(ZoneInfoFile(getzoneinfofile_stream()))
    return _CLASS_ZONE_INSTANCE[0].metadata
|
|
||||||
@ -1,75 +0,0 @@
|
|||||||
import logging
|
|
||||||
import os
|
|
||||||
import tempfile
|
|
||||||
import shutil
|
|
||||||
import json
|
|
||||||
from subprocess import check_call, check_output
|
|
||||||
from tarfile import TarFile
|
|
||||||
|
|
||||||
from dateutil.zoneinfo import METADATA_FN, ZONEFILENAME
|
|
||||||
|
|
||||||
|
|
||||||
def rebuild(filename, tag=None, format="gz", zonegroups=[], metadata=None):
    """Rebuild the internal timezone info in dateutil/zoneinfo/zoneinfo*tar*

    filename is the timezone tarball from ``ftp.iana.org/tz``.

    The members named in ``zonegroups`` are extracted to a temporary
    directory and compiled with ``zic``; ``metadata`` is written next to the
    compiled output as JSON, and everything is packed into ``ZONEFILENAME``
    beside this module using the ``format`` compression.  The temporary
    directory is removed afterwards, even on failure.  ``tag`` is accepted
    but not used.
    """
    workdir = tempfile.mkdtemp()
    compiled_dir = os.path.join(workdir, "zoneinfo")
    package_dir = os.path.dirname(__file__)
    try:
        with TarFile.open(filename) as source:
            for group in zonegroups:
                source.extract(group, workdir)
            sources = [os.path.join(workdir, group) for group in zonegroups]

            _run_zic(compiled_dir, sources)

        # write metadata file
        metadata_path = os.path.join(compiled_dir, METADATA_FN)
        with open(metadata_path, 'w') as handle:
            json.dump(metadata, handle, indent=4, sort_keys=True)

        destination = os.path.join(package_dir, ZONEFILENAME)
        with TarFile.open(destination, "w:%s" % format) as bundle:
            for entry in os.listdir(compiled_dir):
                bundle.add(os.path.join(compiled_dir, entry), entry)
    finally:
        shutil.rmtree(workdir)
|
|
||||||
|
|
||||||
|
|
||||||
def _run_zic(zonedir, filepaths):
|
|
||||||
"""Calls the ``zic`` compiler in a compatible way to get a "fat" binary.
|
|
||||||
|
|
||||||
Recent versions of ``zic`` default to ``-b slim``, while older versions
|
|
||||||
don't even have the ``-b`` option (but default to "fat" binaries). The
|
|
||||||
current version of dateutil does not support Version 2+ TZif files, which
|
|
||||||
causes problems when used in conjunction with "slim" binaries, so this
|
|
||||||
function is used to ensure that we always get a "fat" binary.
|
|
||||||
"""
|
|
||||||
|
|
||||||
try:
|
|
||||||
help_text = check_output(["zic", "--help"])
|
|
||||||
except OSError as e:
|
|
||||||
_print_on_nosuchfile(e)
|
|
||||||
raise
|
|
||||||
|
|
||||||
if b"-b " in help_text:
|
|
||||||
bloat_args = ["-b", "fat"]
|
|
||||||
else:
|
|
||||||
bloat_args = []
|
|
||||||
|
|
||||||
check_call(["zic"] + bloat_args + ["-d", zonedir] + filepaths)
|
|
||||||
|
|
||||||
|
|
||||||
def _print_on_nosuchfile(e):
|
|
||||||
"""Print helpful troubleshooting message
|
|
||||||
|
|
||||||
e is an exception raised by subprocess.check_call()
|
|
||||||
|
|
||||||
"""
|
|
||||||
if e.errno == 2:
|
|
||||||
logging.error(
|
|
||||||
"Could not find zic. Perhaps you need to install "
|
|
||||||
"libc-bin or some other package that provides it, "
|
|
||||||
"or it's not in your PATH?")
|
|
||||||
@ -1 +0,0 @@
|
|||||||
pip
|
|
||||||
@ -1,49 +0,0 @@
|
|||||||
PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
|
|
||||||
--------------------------------------------
|
|
||||||
|
|
||||||
1. This LICENSE AGREEMENT is between the Python Software Foundation
|
|
||||||
("PSF"), and the Individual or Organization ("Licensee") accessing and
|
|
||||||
otherwise using this software ("Python") in source or binary form and
|
|
||||||
its associated documentation.
|
|
||||||
|
|
||||||
2. Subject to the terms and conditions of this License Agreement, PSF
|
|
||||||
hereby grants Licensee a nonexclusive, royalty-free, world-wide
|
|
||||||
license to reproduce, analyze, test, perform and/or display publicly,
|
|
||||||
prepare derivative works, distribute, and otherwise use Python
|
|
||||||
alone or in any derivative version, provided, however, that PSF's
|
|
||||||
License Agreement and PSF's notice of copyright, i.e., "Copyright (c)
|
|
||||||
2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 Python Software Foundation;
|
|
||||||
All Rights Reserved" are retained in Python alone or in any derivative
|
|
||||||
version prepared by Licensee.
|
|
||||||
|
|
||||||
3. In the event Licensee prepares a derivative work that is based on
|
|
||||||
or incorporates Python or any part thereof, and wants to make
|
|
||||||
the derivative work available to others as provided herein, then
|
|
||||||
Licensee hereby agrees to include in any such work a brief summary of
|
|
||||||
the changes made to Python.
|
|
||||||
|
|
||||||
4. PSF is making Python available to Licensee on an "AS IS"
|
|
||||||
basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
|
|
||||||
IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
|
|
||||||
DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
|
|
||||||
FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
|
|
||||||
INFRINGE ANY THIRD PARTY RIGHTS.
|
|
||||||
|
|
||||||
5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
|
|
||||||
FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
|
|
||||||
A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
|
|
||||||
OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
|
|
||||||
|
|
||||||
6. This License Agreement will automatically terminate upon a material
|
|
||||||
breach of its terms and conditions.
|
|
||||||
|
|
||||||
7. Nothing in this License Agreement shall be deemed to create any
|
|
||||||
relationship of agency, partnership, or joint venture between PSF and
|
|
||||||
Licensee. This License Agreement does not grant permission to use PSF
|
|
||||||
trademarks or trade name in a trademark sense to endorse or promote
|
|
||||||
products or services of Licensee, or any third party.
|
|
||||||
|
|
||||||
8. By copying, installing or otherwise using Python, Licensee
|
|
||||||
agrees to be bound by the terms and conditions of this License
|
|
||||||
Agreement.
|
|
||||||
|
|
||||||
@ -1,978 +0,0 @@
|
|||||||
Metadata-Version: 2.1
|
|
||||||
Name: defusedxml
|
|
||||||
Version: 0.7.1
|
|
||||||
Summary: XML bomb protection for Python stdlib modules
|
|
||||||
Home-page: https://github.com/tiran/defusedxml
|
|
||||||
Author: Christian Heimes
|
|
||||||
Author-email: christian@python.org
|
|
||||||
Maintainer: Christian Heimes
|
|
||||||
Maintainer-email: christian@python.org
|
|
||||||
License: PSFL
|
|
||||||
Download-URL: https://pypi.python.org/pypi/defusedxml
|
|
||||||
Keywords: xml bomb DoS
|
|
||||||
Platform: all
|
|
||||||
Classifier: Development Status :: 5 - Production/Stable
|
|
||||||
Classifier: Intended Audience :: Developers
|
|
||||||
Classifier: License :: OSI Approved :: Python Software Foundation License
|
|
||||||
Classifier: Natural Language :: English
|
|
||||||
Classifier: Programming Language :: Python
|
|
||||||
Classifier: Programming Language :: Python :: 2
|
|
||||||
Classifier: Programming Language :: Python :: 2.7
|
|
||||||
Classifier: Programming Language :: Python :: 3
|
|
||||||
Classifier: Programming Language :: Python :: 3.5
|
|
||||||
Classifier: Programming Language :: Python :: 3.6
|
|
||||||
Classifier: Programming Language :: Python :: 3.7
|
|
||||||
Classifier: Programming Language :: Python :: 3.8
|
|
||||||
Classifier: Programming Language :: Python :: 3.9
|
|
||||||
Classifier: Topic :: Text Processing :: Markup :: XML
|
|
||||||
Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*
|
|
||||||
|
|
||||||
===================================================
|
|
||||||
defusedxml -- defusing XML bombs and other exploits
|
|
||||||
===================================================
|
|
||||||
|
|
||||||
.. image:: https://img.shields.io/pypi/v/defusedxml.svg
|
|
||||||
:target: https://pypi.org/project/defusedxml/
|
|
||||||
:alt: Latest Version
|
|
||||||
|
|
||||||
.. image:: https://img.shields.io/pypi/pyversions/defusedxml.svg
|
|
||||||
:target: https://pypi.org/project/defusedxml/
|
|
||||||
:alt: Supported Python versions
|
|
||||||
|
|
||||||
.. image:: https://travis-ci.org/tiran/defusedxml.svg?branch=master
|
|
||||||
:target: https://travis-ci.org/tiran/defusedxml
|
|
||||||
:alt: Travis CI
|
|
||||||
|
|
||||||
.. image:: https://codecov.io/github/tiran/defusedxml/coverage.svg?branch=master
|
|
||||||
:target: https://codecov.io/github/tiran/defusedxml?branch=master
|
|
||||||
:alt: codecov
|
|
||||||
|
|
||||||
.. image:: https://img.shields.io/pypi/dm/defusedxml.svg
|
|
||||||
:target: https://pypistats.org/packages/defusedxml
|
|
||||||
:alt: PyPI downloads
|
|
||||||
|
|
||||||
.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
|
|
||||||
:target: https://github.com/psf/black
|
|
||||||
:alt: Code style: black
|
|
||||||
|
|
||||||
..
|
|
||||||
|
|
||||||
"It's just XML, what could probably go wrong?"
|
|
||||||
|
|
||||||
Christian Heimes <christian@python.org>
|
|
||||||
|
|
||||||
Synopsis
|
|
||||||
========
|
|
||||||
|
|
||||||
The results of an attack on a vulnerable XML library can be fairly dramatic.
|
|
||||||
With just a few hundred **Bytes** of XML data an attacker can occupy several
|
|
||||||
**Gigabytes** of memory within **seconds**. An attacker can also keep
|
|
||||||
CPUs busy for a long time with a small to medium size request. Under some
|
|
||||||
circumstances it is even possible to access local files on your
|
|
||||||
server, to circumvent a firewall, or to abuse services to rebound attacks to
|
|
||||||
third parties.
|
|
||||||
|
|
||||||
The attacks use and abuse less common features of XML and its parsers. The
|
|
||||||
majority of developers are unacquainted with features such as processing
|
|
||||||
instructions and entity expansions that XML inherited from SGML. At best
|
|
||||||
they know about ``<!DOCTYPE>`` from experience with HTML but they are not
|
|
||||||
aware that a document type definition (DTD) can generate an HTTP request
|
|
||||||
or load a file from the file system.
|
|
||||||
|
|
||||||
None of the issues is new. They have been known for a long time. Billion
|
|
||||||
laughs was first reported in 2003. Nevertheless some XML libraries and
|
|
||||||
applications are still vulnerable and even heavy users of XML are
|
|
||||||
surprised by these features. It's hard to say whom to blame for the
|
|
||||||
situation. It's too short sighted to shift all blame on XML parsers and
|
|
||||||
XML libraries for using insecure default settings. After all they
|
|
||||||
properly implement XML specifications. Application developers must not assume
|
|
||||||
that a library is always configured for security and potentially harmful data
|
|
||||||
by default.
|
|
||||||
|
|
||||||
|
|
||||||
.. contents:: Table of Contents
|
|
||||||
:depth: 2
|
|
||||||
|
|
||||||
|
|
||||||
Attack vectors
|
|
||||||
==============
|
|
||||||
|
|
||||||
billion laughs / exponential entity expansion
|
|
||||||
---------------------------------------------
|
|
||||||
|
|
||||||
The `Billion Laughs`_ attack -- also known as exponential entity expansion --
|
|
||||||
uses multiple levels of nested entities. The original example uses 9 levels
|
|
||||||
of 10 expansions in each level to expand the string ``lol`` to a string of
|
|
||||||
3 * 10 :sup:`9` bytes, hence the name "billion laughs". The resulting string
|
|
||||||
occupies 3 GB (2.79 GiB) of memory; intermediate strings require additional
|
|
||||||
memory. Because most parsers don't cache the intermediate step for every
|
|
||||||
expansion it is repeated over and over again. It increases the CPU load even
|
|
||||||
more.
|
|
||||||
|
|
||||||
An XML document of just a few hundred bytes can disrupt all services on a
|
|
||||||
machine within seconds.
|
|
||||||
|
|
||||||
Example XML::
|
|
||||||
|
|
||||||
<!DOCTYPE xmlbomb [
|
|
||||||
<!ENTITY a "1234567890" >
|
|
||||||
<!ENTITY b "&a;&a;&a;&a;&a;&a;&a;&a;">
|
|
||||||
<!ENTITY c "&b;&b;&b;&b;&b;&b;&b;&b;">
|
|
||||||
<!ENTITY d "&c;&c;&c;&c;&c;&c;&c;&c;">
|
|
||||||
]>
|
|
||||||
<bomb>&d;</bomb>
|
|
||||||
|
|
||||||
|
|
||||||
quadratic blowup entity expansion
|
|
||||||
---------------------------------
|
|
||||||
|
|
||||||
A quadratic blowup attack is similar to a `Billion Laughs`_ attack; it abuses
|
|
||||||
entity expansion, too. Instead of nested entities it repeats one large entity
|
|
||||||
with a couple of thousand chars over and over again. The attack isn't as
|
|
||||||
efficient as the exponential case but it avoids triggering countermeasures of
|
|
||||||
parsers against heavily nested entities. Some parsers limit the depth and
|
|
||||||
breadth of a single entity but not the total amount of expanded text
|
|
||||||
throughout an entire XML document.
|
|
||||||
|
|
||||||
A medium-sized XML document with a couple of hundred kilobytes can require a
|
|
||||||
couple of hundred MB to several GB of memory. When the attack is combined
|
|
||||||
with some level of nested expansion an attacker is able to achieve a higher
|
|
||||||
ratio of success.
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
<!DOCTYPE bomb [
|
|
||||||
<!ENTITY a "xxxxxxx... a couple of ten thousand chars">
|
|
||||||
]>
|
|
||||||
<bomb>&a;&a;&a;... repeat</bomb>
|
|
||||||
|
|
||||||
|
|
||||||
external entity expansion (remote)
|
|
||||||
----------------------------------
|
|
||||||
|
|
||||||
Entity declarations can contain more than just text for replacement. They can
|
|
||||||
also point to external resources by public identifiers or system identifiers.
|
|
||||||
System identifiers are standard URIs. When the URI is a URL (e.g. a
|
|
||||||
``http://`` locator) some parsers download the resource from the remote
|
|
||||||
location and embed them into the XML document verbatim.
|
|
||||||
|
|
||||||
Simple example of a parsed external entity::
|
|
||||||
|
|
||||||
<!DOCTYPE external [
|
|
||||||
<!ENTITY ee SYSTEM "http://www.python.org/some.xml">
|
|
||||||
]>
|
|
||||||
<root>&ee;</root>
|
|
||||||
|
|
||||||
The case of parsed external entities works only for valid XML content. The
|
|
||||||
XML standard also supports unparsed external entities with a
|
|
||||||
``NData declaration``.
|
|
||||||
|
|
||||||
External entity expansion opens the door to plenty of exploits. An attacker
|
|
||||||
can abuse a vulnerable XML library and application to rebound and forward
|
|
||||||
network requests with the IP address of the server. It highly depends
|
|
||||||
on the parser and the application what kind of exploit is possible. For
|
|
||||||
example:
|
|
||||||
|
|
||||||
* An attacker can circumvent firewalls and gain access to restricted
|
|
||||||
resources as all the requests are made from an internal and trustworthy
|
|
||||||
IP address, not from the outside.
|
|
||||||
* An attacker can abuse a service to attack, spy on or DoS your servers but
|
|
||||||
also third party services. The attack is disguised with the IP address of
|
|
||||||
the server and the attacker is able to utilize the high bandwidth of a big
|
|
||||||
machine.
|
|
||||||
* An attacker can exhaust additional resources on the machine, e.g. with
|
|
||||||
requests to a service that doesn't respond or responds with very large
|
|
||||||
files.
|
|
||||||
* An attacker may gain knowledge, when, how often and from which IP address
|
|
||||||
an XML document is accessed.
|
|
||||||
* An attacker could send mail from inside your network if the URL handler
|
|
||||||
supports ``smtp://`` URIs.
|
|
||||||
|
|
||||||
|
|
||||||
external entity expansion (local file)
|
|
||||||
--------------------------------------
|
|
||||||
|
|
||||||
External entities with references to local files are a sub-case of external
|
|
||||||
entity expansion. It's listed as an extra attack because it deserves extra
|
|
||||||
attention. Some XML libraries such as lxml disable network access by default
|
|
||||||
but still allow entity expansion with local file access by default. Local
|
|
||||||
files are either referenced with a ``file://`` URL or by a file path (either
|
|
||||||
relative or absolute).
|
|
||||||
|
|
||||||
An attacker may be able to access and download all files that can be read by
|
|
||||||
the application process. This may include critical configuration files, too.
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
<!DOCTYPE external [
|
|
||||||
<!ENTITY ee SYSTEM "file:///PATH/TO/simple.xml">
|
|
||||||
]>
|
|
||||||
<root>&ee;</root>
|
|
||||||
|
|
||||||
|
|
||||||
DTD retrieval
|
|
||||||
-------------
|
|
||||||
|
|
||||||
This case is similar to external entity expansion, too. Some XML libraries
|
|
||||||
like Python's xml.dom.pulldom retrieve document type definitions from remote
|
|
||||||
or local locations. Several attack scenarios from the external entity case
|
|
||||||
apply to this issue as well.
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
<?xml version="1.0" encoding="utf-8"?>
|
|
||||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
|
||||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
||||||
<html>
|
|
||||||
<head/>
|
|
||||||
<body>text</body>
|
|
||||||
</html>
|
|
||||||
|
|
||||||
|
|
||||||
Python XML Libraries
|
|
||||||
====================
|
|
||||||
|
|
||||||
.. csv-table:: vulnerabilities and features
|
|
||||||
:header: "kind", "sax", "etree", "minidom", "pulldom", "xmlrpc", "lxml", "genshi"
|
|
||||||
:widths: 24, 7, 8, 8, 7, 8, 8, 8
|
|
||||||
:stub-columns: 0
|
|
||||||
|
|
||||||
"billion laughs", "**True**", "**True**", "**True**", "**True**", "**True**", "False (1)", "False (5)"
|
|
||||||
"quadratic blowup", "**True**", "**True**", "**True**", "**True**", "**True**", "**True**", "False (5)"
|
|
||||||
"external entity expansion (remote)", "**True**", "False (3)", "False (4)", "**True**", "false", "False (1)", "False (5)"
|
|
||||||
"external entity expansion (local file)", "**True**", "False (3)", "False (4)", "**True**", "false", "**True**", "False (5)"
|
|
||||||
"DTD retrieval", "**True**", "False", "False", "**True**", "false", "False (1)", "False"
|
|
||||||
"gzip bomb", "False", "False", "False", "False", "**True**", "**partly** (2)", "False"
|
|
||||||
"xpath support (7)", "False", "False", "False", "False", "False", "**True**", "False"
|
|
||||||
"xsl(t) support (7)", "False", "False", "False", "False", "False", "**True**", "False"
|
|
||||||
"xinclude support (7)", "False", "**True** (6)", "False", "False", "False", "**True** (6)", "**True**"
|
|
||||||
"C library", "expat", "expat", "expat", "expat", "expat", "libxml2", "expat"
|
|
||||||
|
|
||||||
1. Lxml is protected against billion laughs attacks and doesn't do network
|
|
||||||
lookups by default.
|
|
||||||
2. libxml2 and lxml are not directly vulnerable to gzip decompression bombs
|
|
||||||
but they don't protect you against them either.
|
|
||||||
3. xml.etree doesn't expand entities and raises a ParserError when an entity
|
|
||||||
occurs.
|
|
||||||
4. minidom doesn't expand entities and simply returns the unexpanded entity
|
|
||||||
verbatim.
|
|
||||||
5. genshi.input of genshi 0.6 doesn't support entity expansion and raises a
|
|
||||||
ParserError when an entity occurs.
|
|
||||||
6. Library has (limited) XInclude support but requires an additional step to
|
|
||||||
process inclusion.
|
|
||||||
7. These are features but they may introduce exploitable holes, see
|
|
||||||
`Other things to consider`_
|
|
||||||
|
|
||||||
|
|
||||||
Settings in standard library
|
|
||||||
----------------------------
|
|
||||||
|
|
||||||
|
|
||||||
xml.sax.handler Features
|
|
||||||
........................
|
|
||||||
|
|
||||||
feature_external_ges (http://xml.org/sax/features/external-general-entities)
|
|
||||||
disables external entity expansion
|
|
||||||
|
|
||||||
feature_external_pes (http://xml.org/sax/features/external-parameter-entities)
|
|
||||||
the option is ignored and doesn't modify any functionality
|
|
||||||
|
|
||||||
DOM xml.dom.xmlbuilder.Options
|
|
||||||
..............................
|
|
||||||
|
|
||||||
external_parameter_entities
|
|
||||||
ignored
|
|
||||||
|
|
||||||
external_general_entities
|
|
||||||
ignored
|
|
||||||
|
|
||||||
external_dtd_subset
|
|
||||||
ignored
|
|
||||||
|
|
||||||
entities
|
|
||||||
unsure
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml
|
|
||||||
==========
|
|
||||||
|
|
||||||
The `defusedxml package`_ (`defusedxml on PyPI`_)
|
|
||||||
contains several Python-only workarounds and fixes
|
|
||||||
for denial of service and other vulnerabilities in Python's XML libraries.
|
|
||||||
In order to benefit from the protection you just have to import and use the
|
|
||||||
listed functions / classes from the right defusedxml module instead of the
|
|
||||||
original module. Merely `defusedxml.xmlrpc`_ is implemented as monkey patch.
|
|
||||||
|
|
||||||
Instead of::
|
|
||||||
|
|
||||||
>>> from xml.etree.ElementTree import parse
|
|
||||||
>>> et = parse(xmlfile)
|
|
||||||
|
|
||||||
alter code to::
|
|
||||||
|
|
||||||
>>> from defusedxml.ElementTree import parse
|
|
||||||
>>> et = parse(xmlfile)
|
|
||||||
|
|
||||||
Additionally the package has an **untested** function to monkey patch
|
|
||||||
all stdlib modules with ``defusedxml.defuse_stdlib()``.
|
|
||||||
|
|
||||||
All functions and parser classes accept three additional keyword arguments.
|
|
||||||
They return either the same objects as the original functions or compatible
|
|
||||||
subclasses.
|
|
||||||
|
|
||||||
forbid_dtd (default: False)
|
|
||||||
disallow XML with a ``<!DOCTYPE>`` processing instruction and raise a
|
|
||||||
*DTDForbidden* exception when a DTD processing instruction is found.
|
|
||||||
|
|
||||||
forbid_entities (default: True)
|
|
||||||
disallow XML with ``<!ENTITY>`` declarations inside the DTD and raise an
|
|
||||||
*EntitiesForbidden* exception when an entity is declared.
|
|
||||||
|
|
||||||
forbid_external (default: True)
|
|
||||||
disallow any access to remote or local resources in external entities
|
|
||||||
or DTD and raising an *ExternalReferenceForbidden* exception when a DTD
|
|
||||||
or entity references an external resource.
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml (package)
|
|
||||||
--------------------
|
|
||||||
|
|
||||||
DefusedXmlException, DTDForbidden, EntitiesForbidden,
|
|
||||||
ExternalReferenceForbidden, NotSupportedError
|
|
||||||
|
|
||||||
defuse_stdlib() (*experimental*)
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml.cElementTree
|
|
||||||
-----------------------
|
|
||||||
|
|
||||||
**NOTE** ``defusedxml.cElementTree`` is deprecated and will be removed in a
|
|
||||||
future release. Import from ``defusedxml.ElementTree`` instead.
|
|
||||||
|
|
||||||
parse(), iterparse(), fromstring(), XMLParser
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml.ElementTree
|
|
||||||
-----------------------
|
|
||||||
|
|
||||||
parse(), iterparse(), fromstring(), XMLParser
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml.expatreader
|
|
||||||
----------------------
|
|
||||||
|
|
||||||
create_parser(), DefusedExpatParser
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml.sax
|
|
||||||
--------------
|
|
||||||
|
|
||||||
parse(), parseString(), make_parser()
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml.expatbuilder
|
|
||||||
-----------------------
|
|
||||||
|
|
||||||
parse(), parseString(), DefusedExpatBuilder, DefusedExpatBuilderNS
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml.minidom
|
|
||||||
------------------
|
|
||||||
|
|
||||||
parse(), parseString()
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml.pulldom
|
|
||||||
------------------
|
|
||||||
|
|
||||||
parse(), parseString()
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml.xmlrpc
|
|
||||||
-----------------
|
|
||||||
|
|
||||||
The fix is implemented as monkey patch for the stdlib's xmlrpc package (3.x)
|
|
||||||
or xmlrpclib module (2.x). The function `monkey_patch()` enables the fixes,
|
|
||||||
`unmonkey_patch()` removes the patch and puts the code in its former state.
|
|
||||||
|
|
||||||
The monkey patch protects against XML related attacks as well as
|
|
||||||
decompression bombs and excessively large requests or responses. The default
|
|
||||||
setting is 30 MB for requests, responses and gzip decompression. You can
|
|
||||||
modify the default by changing the module variable `MAX_DATA`. A value of
|
|
||||||
`-1` disables the limit.
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml.lxml
|
|
||||||
---------------
|
|
||||||
|
|
||||||
**DEPRECATED** The module is deprecated and will be removed in a future
|
|
||||||
release.
|
|
||||||
|
|
||||||
The module acts as an *example* how you could protect code that uses
|
|
||||||
lxml.etree. It implements a custom Element class that filters out
|
|
||||||
Entity instances, a custom parser factory and a thread local storage for
|
|
||||||
parser instances. It also has a check_docinfo() function which inspects
|
|
||||||
a tree for internal or external DTDs and entity declarations. In order to
|
|
||||||
check for entities lxml > 3.0 is required.
|
|
||||||
|
|
||||||
parse(), fromstring()
|
|
||||||
RestrictedElement, GlobalParserTLS, getDefaultParser(), check_docinfo()
|
|
||||||
|
|
||||||
|
|
||||||
defusedexpat
|
|
||||||
============
|
|
||||||
|
|
||||||
The `defusedexpat package`_ (`defusedexpat on PyPI`_)
|
|
||||||
comes with binary extensions and a
|
|
||||||
`modified expat`_ library instead of the standard `expat parser`_. It's
|
|
||||||
basically a stand-alone version of the patches for Python's standard
|
|
||||||
library C extensions.
|
|
||||||
|
|
||||||
Modifications in expat
|
|
||||||
----------------------
|
|
||||||
|
|
||||||
new definitions::
|
|
||||||
|
|
||||||
XML_BOMB_PROTECTION
|
|
||||||
XML_DEFAULT_MAX_ENTITY_INDIRECTIONS
|
|
||||||
XML_DEFAULT_MAX_ENTITY_EXPANSIONS
|
|
||||||
XML_DEFAULT_RESET_DTD
|
|
||||||
|
|
||||||
new XML_FeatureEnum members::
|
|
||||||
|
|
||||||
XML_FEATURE_MAX_ENTITY_INDIRECTIONS
|
|
||||||
XML_FEATURE_MAX_ENTITY_EXPANSIONS
|
|
||||||
XML_FEATURE_IGNORE_DTD
|
|
||||||
|
|
||||||
new XML_Error members::
|
|
||||||
|
|
||||||
XML_ERROR_ENTITY_INDIRECTIONS
|
|
||||||
XML_ERROR_ENTITY_EXPANSION
|
|
||||||
|
|
||||||
new API functions::
|
|
||||||
|
|
||||||
int XML_GetFeature(XML_Parser parser,
|
|
||||||
enum XML_FeatureEnum feature,
|
|
||||||
long *value);
|
|
||||||
int XML_SetFeature(XML_Parser parser,
|
|
||||||
enum XML_FeatureEnum feature,
|
|
||||||
long value);
|
|
||||||
int XML_GetFeatureDefault(enum XML_FeatureEnum feature,
|
|
||||||
long *value);
|
|
||||||
int XML_SetFeatureDefault(enum XML_FeatureEnum feature,
|
|
||||||
long value);
|
|
||||||
|
|
||||||
XML_FEATURE_MAX_ENTITY_INDIRECTIONS
|
|
||||||
Limit the amount of indirections that are allowed to occur during the
|
|
||||||
expansion of a nested entity. A counter starts when an entity reference
|
|
||||||
is encountered. It resets after the entity is fully expanded. The limit
|
|
||||||
protects the parser against exponential entity expansion attacks (aka
|
|
||||||
billion laughs attack). When the limit is exceeded the parser stops and
|
|
||||||
fails with `XML_ERROR_ENTITY_INDIRECTIONS`.
|
|
||||||
A value of 0 disables the protection.
|
|
||||||
|
|
||||||
Supported range
|
|
||||||
0 .. UINT_MAX
|
|
||||||
Default
|
|
||||||
40
|
|
||||||
|
|
||||||
XML_FEATURE_MAX_ENTITY_EXPANSIONS
|
|
||||||
Limit the total length of all entity expansions throughout the entire
|
|
||||||
document. The lengths of all entities are accumulated in a parser variable.
|
|
||||||
The setting protects against quadratic blowup attacks (lots of expansions
|
|
||||||
of a large entity declaration). When the sum of all entities exceeds
|
|
||||||
the limit, the parser stops and fails with `XML_ERROR_ENTITY_EXPANSION`.
|
|
||||||
A value of 0 disables the protection.
|
|
||||||
|
|
||||||
Supported range
|
|
||||||
0 .. UINT_MAX
|
|
||||||
Default
|
|
||||||
8 MiB
|
|
||||||
|
|
||||||
XML_FEATURE_RESET_DTD
|
|
||||||
Reset all DTD information after the <!DOCTYPE> block has been parsed. When
|
|
||||||
the flag is set (default: false) all DTD information is reset after the
|
|
||||||
endDoctypeDeclHandler has been called. The flag can be set inside the
|
|
||||||
endDoctypeDeclHandler. Without DTD information any entity reference in
|
|
||||||
the document body leads to `XML_ERROR_UNDEFINED_ENTITY`.
|
|
||||||
|
|
||||||
Supported range
|
|
||||||
0, 1
|
|
||||||
Default
|
|
||||||
0
|
|
||||||
|
|
||||||
|
|
||||||
How to avoid XML vulnerabilities
|
|
||||||
================================
|
|
||||||
|
|
||||||
Best practices
|
|
||||||
--------------
|
|
||||||
|
|
||||||
* Don't allow DTDs
|
|
||||||
* Don't expand entities
|
|
||||||
* Don't resolve externals
|
|
||||||
* Limit parse depth
|
|
||||||
* Limit total input size
|
|
||||||
* Limit parse time
|
|
||||||
* Favor a SAX or iterparse-like parser for potential large data
|
|
||||||
* Validate and properly quote arguments to XSL transformations and
|
|
||||||
XPath queries
|
|
||||||
* Don't use XPath expression from untrusted sources
|
|
||||||
* Don't apply XSL transformations that come from untrusted sources
|
|
||||||
|
|
||||||
(based on Brad Hill's `Attacking XML Security`_)
|
|
||||||
|
|
||||||
|
|
||||||
Other things to consider
|
|
||||||
========================
|
|
||||||
|
|
||||||
XML, XML parsers and processing libraries have more features and possible
|
|
||||||
issues that could lead to DoS vulnerabilities or security exploits in
|
|
||||||
applications. I have compiled an incomplete list of theoretical issues that
|
|
||||||
need further research and more attention. The list is deliberately pessimistic
|
|
||||||
and a bit paranoid, too. It contains things that might go wrong under daffy
|
|
||||||
circumstances.
|
|
||||||
|
|
||||||
|
|
||||||
attribute blowup / hash collision attack
|
|
||||||
----------------------------------------
|
|
||||||
|
|
||||||
XML parsers may use an algorithm with quadratic runtime O(n :sup:`2`) to
|
|
||||||
handle attributes and namespaces. If it uses hash tables (dictionaries) to
|
|
||||||
store attributes and namespaces the implementation may be vulnerable to
|
|
||||||
hash collision attacks, thus reducing the performance to O(n :sup:`2`) again.
|
|
||||||
In either case an attacker is able to forge a denial of service attack with
|
|
||||||
an XML document that contains thousands upon thousands of attributes in
|
|
||||||
a single node.
|
|
||||||
|
|
||||||
I haven't researched yet if expat, pyexpat or libxml2 are vulnerable.
|
|
||||||
|
|
||||||
|
|
||||||
decompression bomb
|
|
||||||
------------------
|
|
||||||
|
|
||||||
The issue of decompression bombs (aka `ZIP bomb`_) applies to all XML libraries
|
|
||||||
that can parse compressed XML streams like gzipped HTTP streams or LZMA-ed
|
|
||||||
files. For an attacker it can reduce the amount of transmitted data by three
|
|
||||||
orders of magnitude or more. Gzip is able to compress 1 GiB of zeros to roughly 1 MB,
|
|
||||||
lzma is even better::
|
|
||||||
|
|
||||||
$ dd if=/dev/zero bs=1M count=1024 | gzip > zeros.gz
|
|
||||||
$ dd if=/dev/zero bs=1M count=1024 | lzma -z > zeros.xy
|
|
||||||
$ ls -sh zeros.*
|
|
||||||
1020K zeros.gz
|
|
||||||
148K zeros.xy
|
|
||||||
|
|
||||||
None of Python's standard XML libraries decompress streams except for
|
|
||||||
``xmlrpclib``. The module is vulnerable <https://bugs.python.org/issue16043>
|
|
||||||
to decompression bombs.
|
|
||||||
|
|
||||||
lxml can load and process compressed data through libxml2 transparently.
|
|
||||||
libxml2 can handle even very large blobs of compressed data efficiently
|
|
||||||
without using too much memory. But it doesn't protect applications from
|
|
||||||
decompression bombs. A carefully written SAX or iterparse-like approach can
|
|
||||||
be safe.
|
|
||||||
|
|
||||||
|
|
||||||
Processing Instruction
|
|
||||||
----------------------
|
|
||||||
|
|
||||||
`PI`_'s like::
|
|
||||||
|
|
||||||
<?xml-stylesheet type="text/xsl" href="style.xsl"?>
|
|
||||||
|
|
||||||
may impose more threats for XML processing. It depends if and how a
|
|
||||||
processor handles processing instructions. The issue of URL retrieval with
|
|
||||||
network or local file access applies to processing instructions, too.
|
|
||||||
|
|
||||||
|
|
||||||
Other DTD features
|
|
||||||
------------------
|
|
||||||
|
|
||||||
`DTD`_ has more features like ``<!NOTATION>``. I haven't researched how
|
|
||||||
these features may be a security threat.
|
|
||||||
|
|
||||||
|
|
||||||
XPath
|
|
||||||
-----
|
|
||||||
|
|
||||||
XPath statements may introduce DoS vulnerabilities. Code should never execute
|
|
||||||
queries from untrusted sources. An attacker may also be able to create an XML
|
|
||||||
document that makes certain XPath queries costly or resource hungry.
|
|
||||||
|
|
||||||
|
|
||||||
XPath injection attacks
|
|
||||||
-----------------------
|
|
||||||
|
|
||||||
XPath injection attacks pretty much work like SQL injection attacks.
|
|
||||||
Arguments to XPath queries must be quoted and validated properly, especially
|
|
||||||
when they are taken from the user. The page `Avoid the dangers of XPath injection`_
|
|
||||||
lists some ramifications of XPath injections.
|
|
||||||
|
|
||||||
Python's standard library doesn't have XPath support. Lxml supports
|
|
||||||
parameterized XPath queries which does proper quoting. You just have to use
|
|
||||||
its xpath() method correctly::
|
|
||||||
|
|
||||||
# DON'T
|
|
||||||
>>> tree.xpath("/tag[@id='%s']" % value)
|
|
||||||
|
|
||||||
# instead do
|
|
||||||
>>> tree.xpath("/tag[@id=$tagid]", tagid=name)
|
|
||||||
|
|
||||||
|
|
||||||
XInclude
|
|
||||||
--------
|
|
||||||
|
|
||||||
`XML Inclusion`_ is another way to load and include external files::
|
|
||||||
|
|
||||||
<root xmlns:xi="http://www.w3.org/2001/XInclude">
|
|
||||||
<xi:include href="filename.txt" parse="text" />
|
|
||||||
</root>
|
|
||||||
|
|
||||||
This feature should be disabled when XML files from an untrusted source are
|
|
||||||
processed. Some Python XML libraries and libxml2 support XInclude but don't
|
|
||||||
have an option to sandbox inclusion and limit it to allowed directories.
|
|
||||||
|
|
||||||
|
|
||||||
XMLSchema location
|
|
||||||
------------------
|
|
||||||
|
|
||||||
A validating XML parser may download schema files from the information in a
|
|
||||||
``xsi:schemaLocation`` attribute.
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
<ead xmlns="urn:isbn:1-931666-22-9"
|
|
||||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
||||||
xsi:schemaLocation="urn:isbn:1-931666-22-9 http://www.loc.gov/ead/ead.xsd">
|
|
||||||
</ead>
|
|
||||||
|
|
||||||
|
|
||||||
XSL Transformation
|
|
||||||
------------------
|
|
||||||
|
|
||||||
You should keep in mind that XSLT is a Turing complete language. Never
|
|
||||||
process XSLT code from unknown or untrusted source! XSLT processors may
|
|
||||||
allow you to interact with external resources in ways you can't even imagine.
|
|
||||||
Some processors even support extensions that allow read/write access to file
|
|
||||||
system, access to JRE objects or scripting with Jython.
|
|
||||||
|
|
||||||
Example from `Attacking XML Security`_ for Xalan-J::
|
|
||||||
|
|
||||||
<xsl:stylesheet version="1.0"
|
|
||||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
|
||||||
xmlns:rt="http://xml.apache.org/xalan/java/java.lang.Runtime"
|
|
||||||
xmlns:ob="http://xml.apache.org/xalan/java/java.lang.Object"
|
|
||||||
exclude-result-prefixes= "rt ob">
|
|
||||||
<xsl:template match="/">
|
|
||||||
<xsl:variable name="runtimeObject" select="rt:getRuntime()"/>
|
|
||||||
<xsl:variable name="command"
|
|
||||||
select="rt:exec($runtimeObject, 'c:\Windows\system32\cmd.exe')"/>
|
|
||||||
<xsl:variable name="commandAsString" select="ob:toString($command)"/>
|
|
||||||
<xsl:value-of select="$commandAsString"/>
|
|
||||||
</xsl:template>
|
|
||||||
</xsl:stylesheet>
|
|
||||||
|
|
||||||
|
|
||||||
Related CVEs
|
|
||||||
============
|
|
||||||
|
|
||||||
CVE-2013-1664
|
|
||||||
Unrestricted entity expansion induces DoS vulnerabilities in Python XML
|
|
||||||
libraries (XML bomb)
|
|
||||||
|
|
||||||
CVE-2013-1665
|
|
||||||
External entity expansion in Python XML libraries inflicts potential
|
|
||||||
security flaws and DoS vulnerabilities
|
|
||||||
|
|
||||||
|
|
||||||
Other languages / frameworks
|
|
||||||
=============================
|
|
||||||
|
|
||||||
Several other programming languages and frameworks are vulnerable as well. A
|
|
||||||
couple of them are affected by the fact that libxml2 up to 2.9.0 has no
|
|
||||||
protection against quadratic blowup attacks. Most of them have potentially
|
|
||||||
dangerous default settings for entity expansion and external entities, too.
|
|
||||||
|
|
||||||
Perl
|
|
||||||
----
|
|
||||||
|
|
||||||
Perl's XML::Simple is vulnerable to quadratic entity expansion and external
|
|
||||||
entity expansion (both local and remote).
|
|
||||||
|
|
||||||
|
|
||||||
Ruby
|
|
||||||
----
|
|
||||||
|
|
||||||
Ruby's REXML document parser is vulnerable to entity expansion attacks
|
|
||||||
(both quadratic and exponential) but it doesn't do external entity
|
|
||||||
expansion by default. In order to counteract entity expansion you have to
|
|
||||||
disable the feature::
|
|
||||||
|
|
||||||
REXML::Document.entity_expansion_limit = 0
|
|
||||||
|
|
||||||
libxml-ruby and hpricot don't expand entities in their default configuration.
|
|
||||||
|
|
||||||
|
|
||||||
PHP
|
|
||||||
---
|
|
||||||
|
|
||||||
PHP's SimpleXML API is vulnerable to quadratic entity expansion and loads
|
|
||||||
entities from local and remote resources. The option ``LIBXML_NONET`` disables
|
|
||||||
network access but still allows local file access. ``LIBXML_NOENT`` seems to
|
|
||||||
have no effect on entity expansion in PHP 5.4.6.
|
|
||||||
|
|
||||||
|
|
||||||
C# / .NET / Mono
|
|
||||||
----------------
|
|
||||||
|
|
||||||
Information in `XML DoS and Defenses (MSDN)`_ suggests that .NET is
|
|
||||||
vulnerable with its default settings. The article contains code snippets showing
|
|
||||||
how to create a secure XML reader::
|
|
||||||
|
|
||||||
XmlReaderSettings settings = new XmlReaderSettings();
|
|
||||||
settings.ProhibitDtd = false;
|
|
||||||
settings.MaxCharactersFromEntities = 1024;
|
|
||||||
settings.XmlResolver = null;
|
|
||||||
XmlReader reader = XmlReader.Create(stream, settings);
|
|
||||||
|
|
||||||
|
|
||||||
Java
|
|
||||||
----
|
|
||||||
|
|
||||||
Untested. The documentation of Xerces and its `Xerces SecurityMananger`_
|
|
||||||
sounds like Xerces is also vulnerable to billion laughs attacks with its
|
|
||||||
default settings. It also does entity resolving when an
|
|
||||||
``org.xml.sax.EntityResolver`` is configured. I'm not yet sure about the
|
|
||||||
default setting here.
|
|
||||||
|
|
||||||
Java specialists suggest using a custom builder factory::
|
|
||||||
|
|
||||||
DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
|
|
||||||
builderFactory.setXIncludeAware(False);
|
|
||||||
builderFactory.setExpandEntityReferences(False);
|
|
||||||
builderFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, True);
|
|
||||||
# either
|
|
||||||
builderFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", True);
|
|
||||||
# or if you need DTDs
|
|
||||||
builderFactory.setFeature("http://xml.org/sax/features/external-general-entities", False);
|
|
||||||
builderFactory.setFeature("http://xml.org/sax/features/external-parameter-entities", False);
|
|
||||||
builderFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", False);
|
|
||||||
builderFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", False);
|
|
||||||
|
|
||||||
|
|
||||||
TODO
|
|
||||||
====
|
|
||||||
|
|
||||||
* DOM: Use xml.dom.xmlbuilder options for entity handling
|
|
||||||
* SAX: take feature_external_ges and feature_external_pes (?) into account
|
|
||||||
* test experimental monkey patching of stdlib modules
|
|
||||||
* improve documentation
|
|
||||||
|
|
||||||
|
|
||||||
License
|
|
||||||
=======
|
|
||||||
|
|
||||||
Copyright (c) 2013-2017 by Christian Heimes <christian@python.org>
|
|
||||||
|
|
||||||
Licensed to PSF under a Contributor Agreement.
|
|
||||||
|
|
||||||
See https://www.python.org/psf/license for licensing details.
|
|
||||||
|
|
||||||
|
|
||||||
Acknowledgements
|
|
||||||
================
|
|
||||||
|
|
||||||
Brett Cannon (Python Core developer)
|
|
||||||
review and code cleanup
|
|
||||||
|
|
||||||
Antoine Pitrou (Python Core developer)
|
|
||||||
code review
|
|
||||||
|
|
||||||
Aaron Patterson, Ben Murphy and Michael Koziarski (Ruby community)
|
|
||||||
Many thanks to Aaron, Ben and Michael from the Ruby community for their
|
|
||||||
report and assistance.
|
|
||||||
|
|
||||||
Thierry Carrez (OpenStack)
|
|
||||||
Many thanks to Thierry for his report to the Python Security Response
|
|
||||||
Team on behalf of the OpenStack security team.
|
|
||||||
|
|
||||||
Carl Meyer (Django)
|
|
||||||
Many thanks to Carl for his report to PSRT on behalf of the Django security
|
|
||||||
team.
|
|
||||||
|
|
||||||
Daniel Veillard (libxml2)
|
|
||||||
Many thanks to Daniel for his insight and assistance with libxml2.
|
|
||||||
|
|
||||||
semantics GmbH (https://www.semantics.de/)
|
|
||||||
Many thanks to my employer semantics for letting me work on the issue
|
|
||||||
during working hours as part of semantics's open source initiative.
|
|
||||||
|
|
||||||
|
|
||||||
References
|
|
||||||
==========
|
|
||||||
|
|
||||||
* `XML DoS and Defenses (MSDN)`_
|
|
||||||
* `Billion Laughs`_ on Wikipedia
|
|
||||||
* `ZIP bomb`_ on Wikipedia
|
|
||||||
* `Configure SAX parsers for secure processing`_
|
|
||||||
* `Testing for XML Injection`_
|
|
||||||
|
|
||||||
.. _defusedxml package: https://github.com/tiran/defusedxml
|
|
||||||
.. _defusedxml on PyPI: https://pypi.python.org/pypi/defusedxml
|
|
||||||
.. _defusedexpat package: https://github.com/tiran/defusedexpat
|
|
||||||
.. _defusedexpat on PyPI: https://pypi.python.org/pypi/defusedexpat
|
|
||||||
.. _modified expat: https://github.com/tiran/expat
|
|
||||||
.. _expat parser: http://expat.sourceforge.net/
|
|
||||||
.. _Attacking XML Security: https://www.isecpartners.com/media/12976/iSEC-HILL-Attacking-XML-Security-bh07.pdf
|
|
||||||
.. _Billion Laughs: https://en.wikipedia.org/wiki/Billion_laughs
|
|
||||||
.. _XML DoS and Defenses (MSDN): https://msdn.microsoft.com/en-us/magazine/ee335713.aspx
|
|
||||||
.. _ZIP bomb: https://en.wikipedia.org/wiki/Zip_bomb
|
|
||||||
.. _DTD: https://en.wikipedia.org/wiki/Document_Type_Definition
|
|
||||||
.. _PI: https://en.wikipedia.org/wiki/Processing_Instruction
|
|
||||||
.. _Avoid the dangers of XPath injection: http://www.ibm.com/developerworks/xml/library/x-xpathinjection/index.html
|
|
||||||
.. _Configure SAX parsers for secure processing: http://www.ibm.com/developerworks/xml/library/x-tipcfsx/index.html
|
|
||||||
.. _Testing for XML Injection: https://www.owasp.org/index.php/Testing_for_XML_Injection_(OWASP-DV-008)
|
|
||||||
.. _Xerces SecurityMananger: https://xerces.apache.org/xerces2-j/javadocs/xerces2/org/apache/xerces/util/SecurityManager.html
|
|
||||||
.. _XML Inclusion: https://www.w3.org/TR/xinclude/#include_element
|
|
||||||
|
|
||||||
Changelog
|
|
||||||
=========
|
|
||||||
|
|
||||||
defusedxml 0.7.1
|
|
||||||
---------------------
|
|
||||||
|
|
||||||
*Release date: 08-Mar-2021*
|
|
||||||
|
|
||||||
- Fix regression ``defusedxml.ElementTree.ParseError`` (#63)
|
|
||||||
The ``ParseError`` exception is now the same class object as
|
|
||||||
``xml.etree.ElementTree.ParseError`` again.
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml 0.7.0
|
|
||||||
----------------
|
|
||||||
|
|
||||||
*Release date: 4-Mar-2021*
|
|
||||||
|
|
||||||
- No changes
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml 0.7.0rc2
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
*Release date: 12-Jan-2021*
|
|
||||||
|
|
||||||
- Re-add and deprecate ``defusedxml.cElementTree``
|
|
||||||
- Use GitHub Actions instead of TravisCI
|
|
||||||
- Restore ``ElementTree`` attribute of ``xml.etree`` module after patching
|
|
||||||
|
|
||||||
defusedxml 0.7.0rc1
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
*Release date: 04-May-2020*
|
|
||||||
|
|
||||||
- Add support for Python 3.9
|
|
||||||
- ``defusedxml.cElementTree`` is not available with Python 3.9.
|
|
||||||
- Python 2 is deprecated. Support for Python 2 will be removed in 0.8.0.
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml 0.6.0
|
|
||||||
----------------
|
|
||||||
|
|
||||||
*Release date: 17-Apr-2019*
|
|
||||||
|
|
||||||
- Increase test coverage.
|
|
||||||
- Add badges to README.
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml 0.6.0rc1
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
*Release date: 14-Apr-2019*
|
|
||||||
|
|
||||||
- Test on Python 3.7 stable and 3.8-dev
|
|
||||||
- Drop support for Python 3.4
|
|
||||||
- No longer pass *html* argument to XMLParser. It has been deprecated and
|
|
||||||
ignored for a long time. The DefusedXMLParser still takes an html argument.
|
|
||||||
A deprecation warning is issued when the argument is False and a TypeError
|
|
||||||
when it's True.
|
|
||||||
- defusedxml now fails early when pyexpat stdlib module is not available or
|
|
||||||
broken.
|
|
||||||
- defusedxml.ElementTree.__all__ now lists ParseError as a public attribute.
|
|
||||||
- The defusedxml.ElementTree and defusedxml.cElementTree modules had a typo
|
|
||||||
and used XMLParse instead of XMLParser as an alias for DefusedXMLParser.
|
|
||||||
Both the old and fixed name are now available.
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml 0.5.0
|
|
||||||
----------------
|
|
||||||
|
|
||||||
*Release date: 07-Feb-2017*
|
|
||||||
|
|
||||||
- No changes
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml 0.5.0.rc1
|
|
||||||
--------------------
|
|
||||||
|
|
||||||
*Release date: 28-Jan-2017*
|
|
||||||
|
|
||||||
- Add compatibility with Python 3.6
|
|
||||||
- Drop support for Python 2.6, 3.1, 3.2, 3.3
|
|
||||||
- Fix lxml tests (XMLSyntaxError: Detected an entity reference loop)
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml 0.4.1
|
|
||||||
----------------
|
|
||||||
|
|
||||||
*Release date: 28-Mar-2013*
|
|
||||||
|
|
||||||
- Add more demo exploits, e.g. python_external.py and Xalan XSLT demos.
|
|
||||||
- Improved documentation.
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml 0.4
|
|
||||||
--------------
|
|
||||||
|
|
||||||
*Release date: 25-Feb-2013*
|
|
||||||
|
|
||||||
- As per http://seclists.org/oss-sec/2013/q1/340 please REJECT
|
|
||||||
CVE-2013-0278, CVE-2013-0279 and CVE-2013-0280 and use CVE-2013-1664,
|
|
||||||
CVE-2013-1665 for OpenStack/etc.
|
|
||||||
- Add missing parser_list argument to sax.make_parser(). The argument is
|
|
||||||
ignored, though. (thanks to Florian Apolloner)
|
|
||||||
- Add demo exploit for external entity attack on Python's SAX parser, XML-RPC
|
|
||||||
and WebDAV.
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml 0.3
|
|
||||||
--------------
|
|
||||||
|
|
||||||
*Release date: 19-Feb-2013*
|
|
||||||
|
|
||||||
- Improve documentation
|
|
||||||
|
|
||||||
|
|
||||||
defusedxml 0.2
|
|
||||||
--------------
|
|
||||||
|
|
||||||
*Release date: 15-Feb-2013*
|
|
||||||
|
|
||||||
- Rename ExternalEntitiesForbidden to ExternalReferenceForbidden
|
|
||||||
- Rename defusedxml.lxml.check_dtd() to check_docinfo()
|
|
||||||
- Unify argument names in callbacks
|
|
||||||
- Add arguments and formatted representation to exceptions
|
|
||||||
- Add forbid_external argument to all functions and classes
|
|
||||||
- More tests
|
|
||||||
- LOTS of documentation
|
|
||||||
- Add example code for other languages (Ruby, Perl, PHP) and parsers (Genshi)
|
|
||||||
- Add protection against XML and gzip attacks to xmlrpclib
|
|
||||||
|
|
||||||
defusedxml 0.1
|
|
||||||
--------------
|
|
||||||
|
|
||||||
*Release date: 08-Feb-2013*
|
|
||||||
|
|
||||||
- Initial and internal release for PSRT review
|
|
||||||
|
|
||||||
|
|
||||||
@ -1,18 +0,0 @@
|
|||||||
defusedxml-0.7.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
|
||||||
defusedxml-0.7.1.dist-info/LICENSE,sha256=uAzp2oxCofkQeWJ_u-K_JyEK4Qig_-Xwd9WwjgdsJMg,2409
|
|
||||||
defusedxml-0.7.1.dist-info/METADATA,sha256=Np0872SHDa-En7pxHLjQWn7-PI2asPdjrcNAef43i7E,32518
|
|
||||||
defusedxml-0.7.1.dist-info/RECORD,,
|
|
||||||
defusedxml-0.7.1.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
||||||
defusedxml-0.7.1.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110
|
|
||||||
defusedxml-0.7.1.dist-info/top_level.txt,sha256=QGHa90F50pVKhWSFlERI0jtSKtqDiGyfeZX7dQNZAAw,11
|
|
||||||
defusedxml/ElementTree.py,sha256=GLSqpCz58oXGPGyzf_HylsPS9_dcGVP5SN4dK7yvyPw,4640
|
|
||||||
defusedxml/__init__.py,sha256=RczeaVJG64p2Fgy1jlCzbuRdchEPnEaCBrxgk8JJ_pM,1444
|
|
||||||
defusedxml/cElementTree.py,sha256=PpaKMh3rU29sY8amAK4fzHQKl8gcAYD0h1LCoW62Rtk,1449
|
|
||||||
defusedxml/common.py,sha256=3d26jNW4fNXzgjWhvUfs83Afiz5EVxFDupQbugkSMZc,4036
|
|
||||||
defusedxml/expatbuilder.py,sha256=b4Q05vsBMJ5StkiTFf4my2rGGo1gZyEl_hC5MeFTOAA,3732
|
|
||||||
defusedxml/expatreader.py,sha256=KOpSrwkSvj5SGOY9pTXOM26Dnz00rsJt33WueVvzpvc,2196
|
|
||||||
defusedxml/lxml.py,sha256=HW-LFKdrfMRzHdi0Vcucq4-n8yz7v_OQwEQWFg1JQYA,4940
|
|
||||||
defusedxml/minidom.py,sha256=3QcgygVwJqcWDQ3IZ2iol8zsH4cx3BRX70SPcd0bG2g,1884
|
|
||||||
defusedxml/pulldom.py,sha256=DYj2D2lc7xoxZ38gfzujXmdznd8ovzDqGFXqyXbtxjk,1170
|
|
||||||
defusedxml/sax.py,sha256=-SF08Msc2mWEYAMw62pJ5FMwWccOctFSnQwDLYLLlVE,1477
|
|
||||||
defusedxml/xmlrpc.py,sha256=7rZQey3tqXcc1hrrM3RprOICU6fiFny9B9l4nmTioxA,5364
|
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue